From d1c4a9b6fb00c715c17c1e91b29e70577fe9ba04 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 10:30:29 +0200 Subject: [PATCH 01/22] WIP using rerun specific table options --- .../java/org/apache/datafusion/DataFrame.java | 23 + .../apache/datafusion/RerunTableOptions.java | 166 ++ .../org/apache/datafusion/SessionContext.java | 74 + native/Cargo.lock | 2345 ++++++++++++++++- native/Cargo.toml | 15 + native/build.rs | 1 + native/src/lib.rs | 30 + native/src/rerun_provider.rs | 318 +++ proto/rerun_table_options.proto | 52 + 9 files changed, 2896 insertions(+), 128 deletions(-) create mode 100644 core/src/main/java/org/apache/datafusion/RerunTableOptions.java create mode 100644 native/src/rerun_provider.rs create mode 100644 proto/rerun_table_options.proto diff --git a/core/src/main/java/org/apache/datafusion/DataFrame.java b/core/src/main/java/org/apache/datafusion/DataFrame.java index d4e0226..c9d1183 100644 --- a/core/src/main/java/org/apache/datafusion/DataFrame.java +++ b/core/src/main/java/org/apache/datafusion/DataFrame.java @@ -230,6 +230,27 @@ public DataFrame filter(String predicate) { return new DataFrame(filterRows(nativeHandle, predicate)); } + /** + * Apply a DataFusion-proto {@code LogicalExprNode} as a filter to this DataFrame. The bytes must + * be a serialized {@code datafusion.LogicalExprNode} (see {@code + * org.apache.datafusion.protobuf.LogicalExprNode}). Used by the Spark connector to push V2 {@code + * Predicate}s as proto-encoded expressions (sibling of {@link #filter(String)} for the structured + * wire path). + * + * @throws IllegalStateException if this DataFrame is closed or already collected. + * @throws RuntimeException if the bytes are not a valid {@code LogicalExprNode}, the expression + * references unknown columns/UDFs, or filter construction fails. 
+ */ + public DataFrame filterFromProto(byte[] exprProtoBytes) { + if (nativeHandle == 0) { + throw new IllegalStateException("DataFrame is closed or already collected"); + } + if (exprProtoBytes == null) { + throw new IllegalArgumentException("filterFromProto exprProtoBytes must be non-null"); + } + return new DataFrame(filterFromProto(nativeHandle, exprProtoBytes)); + } + /** * Take the first {@code fetch} rows. Equivalent to {@link #limit(int, int)} with {@code skip = * 0}. The receiver remains usable and must still be closed independently. @@ -805,6 +826,8 @@ public void close() { private static native long filterRows(long handle, String predicate); + private static native long filterFromProto(long handle, byte[] exprProtoBytes); + private static native long limitRows(long handle, int skip, int fetch); private static native long distinctRows(long handle); diff --git a/core/src/main/java/org/apache/datafusion/RerunTableOptions.java b/core/src/main/java/org/apache/datafusion/RerunTableOptions.java new file mode 100644 index 0000000..d2389a4 --- /dev/null +++ b/core/src/main/java/org/apache/datafusion/RerunTableOptions.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datafusion; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Setup parameters for constructing a Rerun {@code TableProvider} and registering it on a {@link + * SessionContext}. Mirrors the Python construction chain {@code + * CatalogClient(url).get_dataset(name).filter_segments([...]).reader(index=...)}: every field here + * is "Class 1" (author-set, not query-derived) — pushdown filters and projection are negotiated + * separately by the Spark connector via DataFusion-proto {@code LogicalPlanNode}s submitted through + * {@link SessionContext#fromProto}. + * + *
<p>
EXACTLY ONE of {@link Builder#datasetName(String)} or {@link Builder#datasetId(String)} must + * be set. + */ +public final class RerunTableOptions { + + private final String url; + private final String datasetName; + private final String datasetId; + private final List segments; + private final String index; + private final String token; + + private RerunTableOptions(Builder b) { + if (b.url == null || b.url.isEmpty()) { + throw new IllegalArgumentException("RerunTableOptions.url must be non-empty"); + } + if ((b.datasetName == null || b.datasetName.isEmpty()) + == (b.datasetId == null || b.datasetId.isEmpty())) { + // Both unset or both set — either way is ambiguous. + throw new IllegalArgumentException( + "RerunTableOptions: exactly one of datasetName or datasetId must be set"); + } + this.url = b.url; + this.datasetName = b.datasetName == null ? "" : b.datasetName; + this.datasetId = b.datasetId == null ? "" : b.datasetId; + this.segments = + b.segments == null ? Collections.emptyList() : Collections.unmodifiableList(b.segments); + this.index = b.index == null ? "" : b.index; + this.token = b.token == null ? "" : b.token; + } + + public static Builder builder() { + return new Builder(); + } + + public String url() { + return url; + } + + public String datasetName() { + return datasetName; + } + + public String datasetId() { + return datasetId; + } + + public List segments() { + return segments; + } + + public String index() { + return index; + } + + /** + * Return a copy with {@code segments} replaced — used by the Spark connector to narrow to one + * segment per executor task. + */ + public RerunTableOptions withSegments(List segments) { + Builder b = new Builder(); + b.url = this.url; + b.datasetName = this.datasetName.isEmpty() ? null : this.datasetName; + b.datasetId = this.datasetId.isEmpty() ? null : this.datasetId; + b.segments = segments == null ? null : new ArrayList<>(segments); + b.index = this.index.isEmpty() ? 
null : this.index; + b.token = this.token.isEmpty() ? null : this.token; + return new RerunTableOptions(b); + } + + /** + * Serialize as the {@code RerunTableOptions} protobuf consumed by the JNI bridge. Public so the + * Spark connector can ship the bytes through Java serialization to executors (executors + * deserialize them on their own {@link SessionContext} via {@link + * SessionContext#registerRerunTable}). + */ + public byte[] toProtoBytes() { + org.apache.datafusion.protobuf.RerunTableOptions.Builder b = + org.apache.datafusion.protobuf.RerunTableOptions.newBuilder() + .setUrl(url) + .setDatasetName(datasetName) + .setDatasetId(datasetId) + .setIndex(index) + .setToken(token); + b.addAllSegments(segments); + return b.build().toByteArray(); + } + + public static final class Builder { + private String url; + private String datasetName; + private String datasetId; + private List segments; + private String index; + private String token; + + public Builder url(String url) { + this.url = url; + return this; + } + + public Builder datasetName(String name) { + this.datasetName = name; + return this; + } + + public Builder datasetId(String id) { + this.datasetId = id; + return this; + } + + public Builder segments(List segments) { + this.segments = segments == null ? null : new ArrayList<>(segments); + return this; + } + + /** Timeline name to set as the query's {@code filtered_index}. */ + public Builder index(String index) { + this.index = index; + return this; + } + + /** Bearer JWT token. If unset, falls back to stored credentials / {@code REDAP_TOKEN} env. 
*/ + public Builder token(String token) { + this.token = token; + return this; + } + + public RerunTableOptions build() { + return new RerunTableOptions(this); + } + } +} diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java index ffc58dd..9006a26 100644 --- a/core/src/main/java/org/apache/datafusion/SessionContext.java +++ b/core/src/main/java/org/apache/datafusion/SessionContext.java @@ -570,6 +570,75 @@ public void registerUdf(ScalarUdf udf) { * context is closed. * @throws RuntimeException if native registration fails. */ + /** + * Register a Rerun dataset as a {@link org.apache.datafusion.protobuf.RerunTableOptions table} + * named {@code name}. Schema discovery and gRPC connection setup happen on the calling thread + * (synchronously). Subsequent {@link #sql} / {@link #fromProto} queries that reference {@code + * name} drive the embedded Rerun {@code TableProvider} directly — no extra JNI calls per scan. + * + *
<p>
Push-down for filter and projection is negotiated by sending a {@code LogicalPlanNode} + * through {@link #fromProto}: the JVM side encodes {@code TableScan(name) + Filter + Projection} + * referencing this registered table, and the Rust executor executes it natively. + * + * @throws IllegalStateException if this context is closed. + * @throws RuntimeException if native registration fails (network, auth, schema, etc.). + */ + /** + * Raw-bytes overload of {@link #registerRerunTable(String, RerunTableOptions)}. Used by the Spark + * connector on the executor side: the driver serializes a {@link RerunTableOptions} once via + * {@link RerunTableOptions#toProtoBytes()} and ships the bytes through Java serialization, + * skipping a POJO round-trip. + * + *
<p>
{@code optionsProtoBytes} must be a serialized {@code datafusion_java.RerunTableOptions} + * proto (see {@code rerun_table_options.proto}). + * + * @throws IllegalStateException if this context is closed. + * @throws RuntimeException if native registration fails (network, auth, schema, etc.). + */ + public void registerRerunTable(String name, byte[] optionsProtoBytes) { + if (nativeHandle == 0) { + throw new IllegalStateException("SessionContext is closed"); + } + if (name == null || name.isEmpty()) { + throw new IllegalArgumentException("registerRerunTable name must be non-empty"); + } + if (optionsProtoBytes == null) { + throw new IllegalArgumentException("registerRerunTable optionsProtoBytes must be non-null"); + } + registerRerunTableNative(nativeHandle, name, optionsProtoBytes); + } + + public void registerRerunTable(String name, RerunTableOptions options) { + if (nativeHandle == 0) { + throw new IllegalStateException("SessionContext is closed"); + } + if (name == null || name.isEmpty()) { + throw new IllegalArgumentException("registerRerunTable name must be non-empty"); + } + if (options == null) { + throw new IllegalArgumentException("registerRerunTable options must be non-null"); + } + registerRerunTableNative(nativeHandle, name, options.toProtoBytes()); + } + + /** + * Enumerate segment ids for the dataset described by {@code options}. Spark uses this on the + * driver to plan one input partition per segment; on the executor, the connector re-builds the + * same options with {@link RerunTableOptions#withSegments} narrowed to a single segment and calls + * {@link #registerRerunTable}. + * + *
<p>
This is logically a static operation — it makes its own gRPC connection and does not need a + * {@link SessionContext}. Exposed as a static method on this class so all native entry points + * live together. + */ + public static String[] listRerunSegments(RerunTableOptions options) { + if (options == null) { + throw new IllegalArgumentException("listRerunSegments options must be non-null"); + } + String[] result = listRerunSegmentsNative(options.toProtoBytes()); + return result == null ? new String[0] : result; + } + public void registerTable(String name, TableProvider provider) { if (nativeHandle == 0) { throw new IllegalStateException("SessionContext is closed"); @@ -664,4 +733,9 @@ private static native void registerScalarUdf( private static native void registerTableNative( long handle, String name, byte[] schemaIpcBytes, TableProvider provider); + + private static native void registerRerunTableNative( + long handle, String name, byte[] optionsProtoBytes); + + private static native String[] listRerunSegmentsNative(byte[] optionsProtoBytes); } diff --git a/native/Cargo.lock b/native/Cargo.lock index 8c56280..78ce6b9 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -61,6 +61,56 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = 
[ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -89,8 +139,8 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.27.2", + "strum_macros 0.27.2", "thiserror 2.0.18", "uuid", "zstd", @@ -105,6 +155,12 @@ dependencies = [ "object", ] +[[package]] +name = "array-init" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc" + [[package]] name = "arrayref" version = "0.3.9" @@ -119,9 +175,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "607e64bb911ee4f90483e044fe78f175989148c2892e659a2cd25429e782ec54" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" dependencies = [ "arrow-arith", "arrow-array", @@ -140,9 +196,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e754319ed8a85d817fe7adf183227e0b5308b82790a737b426c1124626b48118" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" dependencies = [ "arrow-array", "arrow-buffer", @@ -154,9 +210,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "58.2.0" +version 
= "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841321891f247aa86c6112c80d83d89cb36e0addd020fa2425085b8eb6c3f579" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash", "arrow-buffer", @@ -173,9 +229,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f955dfb73fae000425f49c8226d2044dab60fb7ad4af1e24f961756354d996c9" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" dependencies = [ "bytes", "half", @@ -185,9 +241,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca5e686972523798f76bef355145bc1ae25a84c731e650268d31ab763c701663" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" dependencies = [ "arrow-array", "arrow-buffer", @@ -207,9 +263,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86c276756867fc8186ec380c72c290e6e3b23a1d4fb05df6b1d62d2e62666d48" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" dependencies = [ "arrow-array", "arrow-cast", @@ -222,9 +278,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3b5846209775b6dc8056d77ff9a032b27043383dd5488abd0b663e265b9373" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -235,9 +291,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fd8907ddd8f9fbabf91ec2c85c1d81fe2874e336d2443eb36373595e28b98dd5" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ "arrow-array", "arrow-buffer", @@ -251,9 +307,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4518c59acc501f10d7dcae397fe12b8db3d81bc7de94456f8a58f9165d6f502" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" dependencies = [ "arrow-array", "arrow-buffer", @@ -276,9 +332,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efa70d9d6b1356f1fb9f1f651b84a725b7e0abb93f188cf7d31f14abfa2f2e6f" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -289,9 +345,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faec88a945338192beffbbd4be0def70135422930caa244ac3cec0cd213b26b4" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" dependencies = [ "arrow-array", "arrow-buffer", @@ -302,9 +358,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18aa020f6bc8e5201dcd2d4b7f98c68f8a410ef37128263243e6ff2a47a67d4f" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ "bitflags", "serde_core", @@ -313,9 +369,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a657ab5132e9c8ca3b24eb15a823d0ced38017fe3930ff50167466b02e2d592c" +checksum = 
"8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash", "arrow-array", @@ -327,9 +383,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6de2efbbd1a9f9780ceb8d1ff5d20421b35863b361e3386b4f571f1fc69fcb8" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" dependencies = [ "arrow-array", "arrow-buffer", @@ -342,6 +398,24 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "ascii" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" + +[[package]] +name = "async-channel" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" version = "0.4.42" @@ -365,6 +439,28 @@ dependencies = [ "syn", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -397,6 +493,86 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-lc-rs" +version = "1.17.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.41.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "axum" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" +dependencies = [ + "axum-core", + "bytes", + "form_urlencoded", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "az" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be5eb007b7cacc6c660343e96f650fedf4b5a77512399eb952ca6642cf8d13f7" + [[package]] name = "base64" version = "0.22.1" @@ -422,6 +598,9 @@ name = "bitflags" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +dependencies = [ + "bytemuck", +] [[package]] name = "blake2" @@ -507,6 +686,26 @@ version = 
"3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -528,6 +727,12 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "camino" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" + [[package]] name = "cc" version = "1.2.62" @@ -546,6 +751,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cfb" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -576,8 +792,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", "serde", + "wasm-bindgen", "windows-link", ] @@ -588,9 +806,61 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ "chrono", - "phf", + "phf 0.12.1", +] + +[[package]] +name = 
"chunked_transfer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", ] +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "clean-path" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaa6b4b263a5d737e9bf6b7c09b72c41a5480aec4d7219af827f6564e950b6a5" + [[package]] name = "cmake" version = "0.1.58" @@ -600,6 +870,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "combine" version = "4.6.7" @@ -616,6 +892,7 @@ version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ + "crossterm", "unicode-segmentation", "unicode-width", ] @@ -641,6 +918,15 @@ version 
= "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "const-random" version = "0.1.18" @@ -667,6 +953,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "convert_case" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "affbf0190ed2caf063e3def54ff444b449371d55c58e513a95ab98eca50adb49" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -710,12 +1005,85 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags", + "crossterm_winapi", + "document-features", + "parking_lot", + "rustix", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" @@ -841,7 +1209,7 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "itertools", + "itertools 0.14.0", "liblzma", "log", "object_store", @@ -875,7 +1243,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "itertools", + "itertools 0.14.0", "log", "object_store", "parking_lot", @@ -900,7 +1268,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "futures", - "itertools", + "itertools 0.14.0", "log", "object_store", ] @@ -919,7 +1287,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools", + "itertools 0.14.0", "libc", "log", "object_store", @@ -966,7 +1334,7 @@ dependencies = [ "flate2", "futures", "glob", - "itertools", + "itertools 0.14.0", "liblzma", "log", "object_store", @@ -996,7 +1364,7 @@ dependencies = [ 
"datafusion-physical-plan", "datafusion-session", "futures", - "itertools", + "itertools 0.14.0", "object_store", "tokio", ] @@ -1090,7 +1458,7 @@ dependencies = [ "datafusion-pruning", "datafusion-session", "futures", - "itertools", + "itertools 0.14.0", "log", "object_store", "parking_lot", @@ -1143,7 +1511,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", - "itertools", + "itertools 0.14.0", "paste", "recursive", "serde_json", @@ -1159,7 +1527,7 @@ dependencies = [ "arrow", "datafusion-common", "indexmap", - "itertools", + "itertools 0.14.0", "paste", ] @@ -1183,7 +1551,7 @@ dependencies = [ "datafusion-expr-common", "datafusion-macros", "hex", - "itertools", + "itertools 0.14.0", "log", "md-5", "memchr", @@ -1249,7 +1617,7 @@ dependencies = [ "datafusion-macros", "datafusion-physical-expr-common", "hashbrown 0.16.1", - "itertools", + "itertools 0.14.0", "itoa", "log", "paste", @@ -1314,6 +1682,15 @@ dependencies = [ "prost", "prost-build", "protoc-bin-vendored", + "re_auth", + "re_dataframe", + "re_datafusion", + "re_log_types", + "re_protos", + "re_redap_client", + "re_types_core", + "re_uri", + "rustls", "tokio", "tokio-metrics", "url", @@ -1343,7 +1720,7 @@ dependencies = [ "datafusion-expr-common", "datafusion-physical-expr", "indexmap", - "itertools", + "itertools 0.14.0", "log", "recursive", "regex", @@ -1366,7 +1743,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools", + "itertools 0.14.0", "parking_lot", "paste", "petgraph", @@ -1386,7 +1763,7 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-expr-common", - "itertools", + "itertools 0.14.0", ] [[package]] @@ -1402,7 +1779,7 @@ dependencies = [ "datafusion-expr-common", "hashbrown 0.16.1", "indexmap", - "itertools", + "itertools 0.14.0", "parking_lot", ] @@ -1421,7 +1798,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-pruning", - 
"itertools", + "itertools 0.14.0", "recursive", ] @@ -1449,7 +1826,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools", + "itertools 0.14.0", "log", "num-traits", "parking_lot", @@ -1509,7 +1886,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools", + "itertools 0.14.0", "log", ] @@ -1557,7 +1934,7 @@ dependencies = [ "chrono", "datafusion", "half", - "itertools", + "itertools 0.14.0", "object_store", "pbjson-types", "prost", @@ -1578,28 +1955,93 @@ dependencies = [ ] [[package]] -name = "displaydoc" -version = "0.2.5" +name = "directories" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d" dependencies = [ - "proc-macro2", + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.61.2", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", "quote", "syn", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dunce" +version = "1.0.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "ehttp" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2f1b93eb2e039aaff63ce07cca59bd1dca02f2ce30075a17b619d2c42f56efc" +dependencies = [ + "async-channel", + "document-features", + "js-sys", + "serde", + "serde_json", + "ureq", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "emath" +version = "0.34.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b53f0d33a479321da6b0caa71366c9f67e8a2c149762d90bdc0d16e601ee8ecb" + [[package]] name = "equivalent" version = "1.0.2" @@ -1616,6 +2058,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "fastrand" version = "2.4.1" @@ -1628,6 +2091,19 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] 
+name = "fixed" +version = "1.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9af2cbf772fa6d1c11358f92ef554cb6b386201210bcf0e91fb7fba8a907fb40" +dependencies = [ + "az", + "bytemuck", + "half", + "serde", + "typenum", +] + [[package]] name = "fixedbitset" version = "0.5.7" @@ -1682,6 +2158,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.32" @@ -1852,6 +2334,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ + "bytemuck", "cfg-if", "crunchy", "num-traits", @@ -1902,6 +2385,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "http" version = "1.4.0" @@ -1941,6 +2433,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.3.0" @@ -1961,6 +2459,7 @@ dependencies = [ "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -1982,6 +2481,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", + "webpki-roots", +] + +[[package]] 
+name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -2146,6 +2659,12 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indent" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9f1a0777d972970f204fdf8ef319f1f4f8459131636d7e3c96c5d59570d0fa6" + [[package]] name = "indexmap" version = "2.14.0" @@ -2158,6 +2677,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "infer" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847" +dependencies = [ + "cfb", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -2170,6 +2698,21 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -2185,6 +2728,49 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jiff" +version = "0.2.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4603d3033e49e2b0e31229fcab20a5d40089c607d975cd9c80551dc69eed9102" 
+dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "js-sys", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "jiff-static" +version = "0.2.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jni" version = "0.21.1" @@ -2251,6 +2837,27 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "jsonwebtoken" +version = "10.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eba32bfb4ffdeaca3e34431072faf01745c9b26d25504aa7a6cf5684334fc4fc" +dependencies = [ + "base64", + "getrandom 0.2.17", + "js-sys", + "serde", + "serde_json", + "signature", + "zeroize", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "leb128fmt" version = "0.1.0" @@ -2352,6 +2959,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libredox" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" +dependencies = [ + "libc", +] + [[package]] name = 
"linux-raw-sys" version = "0.12.1" @@ -2364,6 +2980,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -2371,6 +2993,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ "scopeguard", + "serde", ] [[package]] @@ -2379,6 +3002,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "log-once" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d8a05e3879b317b1b6dbf353e5bba7062bedcc59815267bb23eaa0c576cebf0" +dependencies = [ + "log", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -2394,6 +3026,31 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "md-5" version = "0.10.6" @@ -2410,6 +3067,34 @@ version = "2.8.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memory-stats" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c73f5c649995a115e1a0220b35e4df0a1294500477f97a91d0660fb5abeb574a" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess2" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1706dc14a2e140dec0a7a07109d9a3d5890b81e85bd6c60b906b249a77adf0ca" +dependencies = [ + "mime", + "phf 0.11.3", + "phf_shared 0.11.3", + "unicase", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -2437,6 +3122,42 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "natord" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308d96db8debc727c3fd9744aac51751243420e46edf401010908da7f8d5e57c" + +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -2457,6 +3178,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -2503,7 +3235,7 @@ dependencies = [ "http-body-util", "humantime", "hyper", - "itertools", + "itertools 0.14.0", "md-5", "parking_lot", "percent-encoding", @@ -2530,6 +3262,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "openssl-probe" version = "0.2.1" @@ -2537,33 +3275,134 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] -name = "ordered-float" -version = "2.10.1" +name = "opentelemetry" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" dependencies = [ - "num-traits", + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", ] [[package]] -name = "parking_lot" -version = "0.12.5" +name = "opentelemetry-appender-tracing" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2" dependencies = [ - "lock_api", - "parking_lot_core", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-opentelemetry", + "tracing-subscriber", ] [[package]] -name = "parking_lot_core" -version = "0.9.12" +name = "opentelemetry-http" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" dependencies = [ - "cfg-if", - "libc", - "redox_syscall", + "async-trait", + "bytes", + "http", + "opentelemetry", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f" +dependencies = [ + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "reqwest", + "thiserror 2.0.18", + "tokio", + "tonic", + "tracing", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", + "tonic-prost", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "opentelemetry", + "percent-encoding", + "rand 0.9.4", + "thiserror 2.0.18", + "tokio", + "tokio-stream", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", "smallvec", "windows-link", ] @@ -2627,7 +3466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck", - "itertools", + "itertools 0.14.0", "prost", "prost-types", ] @@ -2647,6 +3486,33 @@ dependencies = [ "serde", ] +[[package]] +name = "peg" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aad070be5b63aa72103f2fcdd70a83adbd5e90112ce5b574171ff1c65501773" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd8ef6825cae95355031ae26a99b616a2a21f22ba2de0197c43dfb05acbe7ee" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.8.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7011d97b484a5ebdc4b1fdb3b12d5e4bbbea56e9d22b688f2e79e04b65a7d8a6" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -2665,13 +3531,57 @@ dependencies = [ "serde", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared 0.11.3", +] + [[package]] name = "phf" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "phf_shared", + "phf_shared 0.12.1", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.6", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn", + "unicase", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", + "unicase", ] [[package]] @@ -2683,6 +3593,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -2695,6 +3625,32 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "ply-rs-bw" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe55bbee2b70d1c1e58d8340eda9a80c5ce11fb9b1bc10b5fc1575c490d38fa9" +dependencies = [ + "byteorder", + "indexmap", + "peg", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -2723,6 +3679,15 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -2732,6 +3697,29 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prometheus-client" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cca3d75b4566b9a29fe1ed623587fb058e826eb329a0be4b7c4da1ebb2d7a6ca" +dependencies = [ + "dtoa", + "itoa", + "parking_lot", + "prometheus-client-derive-encode", +] + +[[package]] +name = "prometheus-client-derive-encode" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9adf1691c04c0a5ff46ff8f262b58beb07b0dbb61f96f9f54f6cbd82106ed87f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "prost" version = "0.14.3" @@ -2749,7 +3737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools", + "itertools 0.14.0", "log", "multimap", "petgraph", @@ -2768,7 +3756,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools", + "itertools 0.14.0", "proc-macro2", "quote", "syn", @@ -2866,6 +3854,20 @@ dependencies = [ "cc", ] +[[package]] +name = "puffin" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa9dae7b05c02ec1a6bc9bcf20d8bc64a7dcbf57934107902a872014899b741f" +dependencies = [ + "anyhow", + "byteorder", + "cfg-if", + "itertools 0.10.5", + "once_cell", + "parking_lot", +] + [[package]] name = "quad-rand" version = "0.2.3" @@ -2958,6 +3960,15 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.4" @@ -2989,6 +4000,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + [[package]] name = "rand_core" version = "0.9.5" @@ -3005,80 
+4025,761 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" [[package]] -name = "recursive" -version = "0.1.1" +name = "rawpointer" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" +name = "rayon" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" dependencies = [ - "quote", - "syn", + "either", + "rayon-core", ] [[package]] -name = "redox_syscall" -version = "0.5.18" +name = "rayon-core" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ - "bitflags", + "crossbeam-deque", + "crossbeam-utils", ] [[package]] -name = "regex" -version = "1.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +name = "re_analytics" +version = "0.34.0-alpha.1+dev" dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", + "crossbeam", + "directories", + "ehttp", + "jiff", + "re_build_info", + "re_log", + "re_quota_channel", + "serde", + "serde_json", + "sha2", + "thiserror 2.0.18", + "url", + "uuid", + "web-sys", ] [[package]] -name = "regex-automata" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +name = "re_arrow_util" +version = "0.34.0-alpha.1+dev" dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", + "anyhow", + "arrow", + "comfy-table", + "half", + "itertools 0.14.0", + "re_log", + "re_tracing", + "re_tuid", + "serde_json", + "thiserror 2.0.18", ] [[package]] -name = "regex-lite" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" - -[[package]] -name = "regex-syntax" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" - -[[package]] -name = "regress" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" +name = "re_auth" +version = "0.34.0-alpha.1+dev" dependencies = [ - "hashbrown 0.16.1", - "memchr", -] - -[[package]] + "async-trait", + "base64", + "directories", + "ehttp", + "getrandom 0.2.17", + "getrandom 0.3.4", + "hmac", + "http", + "jiff", + "js-sys", + "jsonwebtoken", + "parking_lot", + "rand 0.9.4", + "re_analytics", + "re_log", + "ring", + "saturating_cast", + "serde", + "serde_json", + "sha2", + "signature", + "thiserror 2.0.18", + "tiny_http", + "tokio", + "tonic", + "tower", + "url", + "uuid", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "re_backoff" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "getrandom 0.3.4", + "js-sys", + "rand 0.9.4", + "tokio", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "re_build_info" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "re_byte_size", + "serde", +] + +[[package]] +name = "re_byte_size" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "arrow", + "half", + "parking_lot", + "re_byte_size_derive", + "smallvec", + "vec1", +] + +[[package]] +name = 
"re_byte_size_derive" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "re_case" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "convert_case", +] + +[[package]] +name = "re_chunk" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "anyhow", + "arrow", + "bytemuck", + "crossbeam", + "document-features", + "half", + "itertools 0.14.0", + "nohash-hasher", + "rand 0.9.4", + "re_arrow_util", + "re_byte_size", + "re_error", + "re_format", + "re_log", + "re_log_types", + "re_quota_channel", + "re_sorbet", + "re_span", + "re_tracing", + "re_types_core", + "thiserror 2.0.18", +] + +[[package]] +name = "re_chunk_store" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "anyhow", + "arrow", + "document-features", + "indent", + "itertools 0.14.0", + "nohash-hasher", + "parking_lot", + "re_arrow_util", + "re_byte_size", + "re_chunk", + "re_format", + "re_log", + "re_log_encoding", + "re_log_types", + "re_sdk_types", + "re_sorbet", + "re_tracing", + "re_types_core", + "saturating_cast", + "thiserror 2.0.18", + "web-time", +] + +[[package]] +name = "re_dataframe" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "anyhow", + "arrow", + "itertools 0.14.0", + "nohash-hasher", + "rayon", + "re_arrow_util", + "re_chunk", + "re_chunk_store", + "re_log", + "re_log_types", + "re_query", + "re_sorbet", + "re_span", + "re_tracing", + "re_types_core", + "tracing", +] + +[[package]] +name = "re_datafusion" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "arrow", + "async-stream", + "async-trait", + "chrono", + "datafusion", + "futures", + "futures-util", + "getrandom 0.3.4", + "http", + "itertools 0.14.0", + "jiff", + "opentelemetry", + "opentelemetry-proto", + "parking_lot", + "re_analytics", + "re_arrow_util", + "re_backoff", + "re_byte_size", + "re_dataframe", + "re_format", + "re_log", + "re_log_encoding", + "re_log_types", + "re_perf_telemetry", + 
"re_protos", + "re_redap_client", + "re_sorbet", + "re_tracing", + "re_types_core", + "re_uri", + "reqwest", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "wasm-bindgen-futures", + "web-time", +] + +[[package]] +name = "re_error" +version = "0.34.0-alpha.1+dev" + +[[package]] +name = "re_format" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "half", + "itertools 0.14.0", + "num-traits", + "re_log", +] + +[[package]] +name = "re_grpc_headers" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "http", + "pin-project-lite", + "tonic", + "tower", +] + +[[package]] +name = "re_log" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "crossbeam", + "log", + "log-once", + "parking_lot", + "tracing", + "tracing-log", + "tracing-subscriber", + "tracing-web", +] + +[[package]] +name = "re_log_channel" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "camino", + "crossbeam", + "futures", + "parking_lot", + "re_byte_size", + "re_log_encoding", + "re_log_types", + "re_quota_channel", + "re_tracing", + "re_uri", + "serde", + "thiserror 2.0.18", +] + +[[package]] +name = "re_log_encoding" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "arrow", + "bytes", + "crossbeam", + "itertools 0.14.0", + "lz4_flex", + "parking_lot", + "re_arrow_util", + "re_build_info", + "re_byte_size", + "re_chunk", + "re_log", + "re_log_types", + "re_protos", + "re_quota_channel", + "re_sorbet", + "re_span", + "re_tracing", + "re_types_core", + "sha2", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tracing", + "xxhash-rust", +] + +[[package]] +name = "re_log_types" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "arrow", + "bytemuck", + "clean-path", + "document-features", + "fixed", + "half", + "itertools 0.14.0", + "jiff", + "natord", + "nohash-hasher", + "num-derive", + "num-traits", + "parking_lot", + "re_arrow_util", + "re_build_info", + "re_byte_size", + "re_format", + "re_log", + "re_string_interner", + "re_tracing", + "re_tuid", + 
"re_types_core", + "serde", + "static_assertions", + "thiserror 2.0.18", + "typenum", + "uuid", + "web-time", + "xxhash-rust", +] + +[[package]] +name = "re_perf_telemetry" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "anyhow", + "axum", + "base64", + "clap", + "http", + "memory-stats", + "opentelemetry", + "opentelemetry-appender-tracing", + "opentelemetry-http", + "opentelemetry-otlp", + "opentelemetry_sdk", + "parking_lot", + "prometheus-client", + "rand 0.9.4", + "re_auth", + "re_grpc_headers", + "serde", + "serde_json", + "tokio", + "tonic", + "tower", + "tower-http", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", +] + +[[package]] +name = "re_protos" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "arrow", + "http", + "itertools 0.14.0", + "jiff", + "lz4_flex", + "opentelemetry", + "prost", + "prost-types", + "re_arrow_util", + "re_build_info", + "re_byte_size", + "re_chunk", + "re_grpc_headers", + "re_log_types", + "re_sorbet", + "re_tracing", + "re_tuid", + "re_types_core", + "serde", + "thiserror 2.0.18", + "tonic", + "tonic-prost", + "tower", + "tracing", + "url", +] + +[[package]] +name = "re_query" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "anyhow", + "arrow", + "indent", + "itertools 0.14.0", + "nohash-hasher", + "parking_lot", + "paste", + "re_byte_size", + "re_chunk", + "re_chunk_store", + "re_error", + "re_format", + "re_log", + "re_log_types", + "re_tracing", + "re_types_core", + "seq-macro", + "thiserror 2.0.18", +] + +[[package]] +name = "re_quota_channel" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "crossbeam", + "parking_lot", + "re_byte_size", + "re_format", + "re_log", +] + +[[package]] +name = "re_redap_client" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "arrow", + "ehttp", + "futures", + "itertools 0.14.0", + "jiff", + "opentelemetry", + "re_arrow_util", + "re_auth", + "re_backoff", + "re_byte_size", + "re_chunk", + "re_error", + "re_format", + "re_log", 
+ "re_log_channel", + "re_log_encoding", + "re_log_types", + "re_protos", + "re_tracing", + "re_types_core", + "re_uri", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-web-wasm-client", + "tower", + "tracing", + "url", + "web-time", +] + +[[package]] +name = "re_rvl" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "byteorder", + "thiserror 2.0.18", +] + +[[package]] +name = "re_sdk_types" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "array-init", + "arrow", + "bytemuck", + "document-features", + "emath", + "half", + "indexmap", + "infer", + "itertools 0.14.0", + "mime_guess2", + "ndarray", + "nohash-hasher", + "ply-rs-bw", + "re_byte_size", + "re_error", + "re_format", + "re_log", + "re_log_types", + "re_rvl", + "re_sorbet", + "re_tracing", + "re_types_core", + "serde", + "smallvec", + "thiserror 2.0.18", + "uuid", +] + +[[package]] +name = "re_sorbet" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "arrow", + "itertools 0.14.0", + "nohash-hasher", + "re_arrow_util", + "re_byte_size", + "re_log", + "re_log_types", + "re_tracing", + "re_tuid", + "re_types_core", + "semver", + "strum 0.26.3", + "thiserror 2.0.18", + "tracing", + "web-time", +] + +[[package]] +name = "re_span" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "num-traits", +] + +[[package]] +name = "re_string_interner" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "ahash", + "nohash-hasher", + "parking_lot", + "re_byte_size", + "serde", + "static_assertions", +] + +[[package]] +name = "re_tracing" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "parking_lot", + "puffin", +] + +[[package]] +name = "re_tuid" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "bytemuck", + "document-features", + "getrandom 0.3.4", + "re_byte_size", + "re_log", + "serde", + "web-time", +] + +[[package]] +name = "re_types_core" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "anyhow", + "arrow", + "bitflags", + "bytemuck", + "document-features", + "half", + 
"itertools 0.14.0", + "nohash-hasher", + "re_arrow_util", + "re_byte_size", + "re_case", + "re_error", + "re_log", + "re_string_interner", + "re_tracing", + "re_tuid", + "serde", + "thiserror 2.0.18", +] + +[[package]] +name = "re_uri" +version = "0.34.0-alpha.1+dev" +dependencies = [ + "percent-encoding", + "re_byte_size", + "re_log", + "re_log_types", + "re_tuid", + "re_types_core", + "serde", + "static_assertions", + "thiserror 2.0.18", + "url", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror 2.0.18", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies 
= [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "regress" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" +dependencies = [ + "hashbrown 0.16.1", + "memchr", +] + +[[package]] name = "reqwest" version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -3086,6 +4787,7 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64", "bytes", + "futures-channel", "futures-core", "futures-util", "h2", @@ -3118,6 +4820,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", + "webpki-roots", ] [[package]] @@ -3168,6 +4871,8 @@ version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ + "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", @@ -3204,6 +4909,7 @@ version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -3230,6 +4936,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "saturating_cast" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fc4972f129a0ea378b69fa7c186d63255606e362ad00795f00b869dea5265eb" + [[package]] name = "schannel" version = "0.1.29" @@ -3366,10 
+5078,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ "itoa", - "memchr", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", "serde", "serde_core", - "zmij", ] [[package]] @@ -3420,12 +5143,30 @@ dependencies = [ "digest", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simd-adler32" version = "0.3.9" @@ -3513,18 +5254,46 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros 
0.26.4", +] + [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "strum_macros" version = "0.27.2" @@ -3653,6 +5422,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "thrift" version = "0.17.0" @@ -3673,6 +5451,18 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tiny_http" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82" +dependencies = [ + "ascii", + "chunked_transfer", + "httpdate", + "log", +] + [[package]] name = "tinystr" version = "0.8.3" @@ -3771,6 +5561,103 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.25.12+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7" +dependencies = [ + "indexmap", + "toml_datetime", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow", +] + +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "flate2", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "rustls-native-certs", + "socket2", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-web-wasm-client" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898cd44be5e23e59d2956056538f1d6b3c5336629d384ffd2d92e76f87fb98ff" +dependencies = [ + "base64", + "byteorder", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "httparse", + "js-sys", + "pin-project", + "thiserror 2.0.18", + "tonic", + "tower-service", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + [[package]] name = "tower" version = "0.5.3" @@ -3779,11 +5666,15 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -3801,6 +5692,7 @@ dependencies = [ "tower", "tower-layer", "tower-service", + "tracing", "url", ] @@ -3822,6 +5714,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -3845,6 +5738,78 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" +dependencies = [ + "js-sys", + "opentelemetry", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "tracing-web" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e6a141feebd51f8d91ebfd785af50fca223c570b86852166caa3b141defe7c" +dependencies = [ + "js-sys", + "tracing-core", + "tracing-subscriber", + "wasm-bindgen", + "web-sys", ] 
[[package]] @@ -3912,6 +5877,12 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -3948,6 +5919,35 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +dependencies = [ + "base64", + "flate2", + "log", + "percent-encoding", + "rustls", + "rustls-pki-types", + "ureq-proto", + "utf8-zero", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +dependencies = [ + "base64", + "http", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.8" @@ -3958,14 +5958,27 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + [[package]] name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.23.1" @@ -3978,6 +5991,22 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = 
"0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vec1" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eab68b56840f69efb0fefbe3ab6661499217ffdc58e2eef7c3f6f69835386322" +dependencies = [ + "serde", + "smallvec", +] + [[package]] name = "version_check" version = "0.9.5" @@ -4149,6 +6178,31 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -4158,6 +6212,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -4439,6 +6499,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.51.0" @@ -4539,6 +6608,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "yoke" version = "0.8.2" @@ -4608,6 +6683,20 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "zerotrie" diff --git a/native/Cargo.toml b/native/Cargo.toml index c462408..9ca2a39 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -81,6 +81,21 @@ jni = "0.21" object_store = { version = "0.13", default-features = false } prost = "0.14" tokio = { version = "1", features = ["rt-multi-thread"] } + +# Rerun provider crates. Linked directly into the cdylib so the embedded +# DataFusion `SessionContext` can register the `TableProvider` via the +# native `ctx.register_table(...)` path — no `datafusion-ffi` boundary. 
+re_datafusion = { path = "../../reality/rerun/crates/store/re_datafusion" } +re_redap_client = { path = "../../reality/rerun/crates/store/re_redap_client" } +re_dataframe = { path = "../../reality/rerun/crates/store/re_dataframe" } +re_log_types = { path = "../../reality/rerun/crates/store/re_log_types" } +re_protos = { path = "../../reality/rerun/crates/store/re_protos" } +re_types_core = { path = "../../reality/rerun/crates/store/re_types_core" } +re_uri = { path = "../../reality/rerun/crates/store/re_uri" } +re_auth = { path = "../../reality/rerun/crates/utils/re_auth" } +# Direct rustls dep so we can install the default crypto provider once at +# startup — required by the rerun TLS stack (see rerun_py catalog_client.rs). +rustls = "0.23" # Tokio runtime metrics. Optional + cfg-gated: this crate's API surface lives # behind `--cfg tokio_unstable`, so enabling the `runtime-metrics` feature also # requires the caller to set `RUSTFLAGS="--cfg tokio_unstable"` at build time. diff --git a/native/build.rs b/native/build.rs index d292514..dd3918e 100644 --- a/native/build.rs +++ b/native/build.rs @@ -28,6 +28,7 @@ fn main() { "../proto/json_write_options.proto", "../proto/object_store_options.proto", "../proto/parquet_read_options.proto", + "../proto/rerun_table_options.proto", ]; for p in PROTOS { println!("cargo:rerun-if-changed={p}"); diff --git a/native/src/lib.rs b/native/src/lib.rs index 4fd7a8a..d2745aa 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -25,6 +25,7 @@ mod json; mod memory; mod object_store; mod proto; +mod rerun_provider; mod runtime_metrics; mod schema; mod table_provider; @@ -535,6 +536,35 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_filterRows<'local>( }) } +/// Decode a DataFusion-proto `LogicalExprNode` and apply it as a `Filter` to this DataFrame. +/// Used by the Spark connector to push V2 `Predicate`s as DataFusion `Expr` bytes (translated +/// JVM-side by `SparkPredicateTranslator`). 
+#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_DataFrame_filterFromProto<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + expr_proto_bytes: JByteArray<'local>, +) -> jlong { + use datafusion_proto::logical_plan::from_proto::parse_expr; + use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; + use datafusion_proto::protobuf::LogicalExprNode; + + try_unwrap_or_throw(&mut env, 0, |env| -> JniResult { + if handle == 0 { + return Err("DataFrame handle is null".into()); + } + let df = unsafe { &*(handle as *const DataFrame) }.clone(); + let bytes: Vec = env.convert_byte_array(&expr_proto_bytes)?; + let node = LogicalExprNode::decode(bytes.as_slice())?; + let task_ctx = df.task_ctx(); + let extension_codec = DefaultLogicalExtensionCodec {}; + let expr = parse_expr(&node, &task_ctx, &extension_codec)?; + let new_df = df.filter(expr)?; + Ok(Box::into_raw(Box::new(new_df)) as jlong) + }) +} + #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_DataFrame_limitRows<'local>( mut env: JNIEnv<'local>, diff --git a/native/src/rerun_provider.rs b/native/src/rerun_provider.rs new file mode 100644 index 0000000..bce3454 --- /dev/null +++ b/native/src/rerun_provider.rs @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Rerun `TableProvider` registration and segment enumeration JNI surface. +//! +//! Two JNI entry points used by the Spark connector. Scan + filter + projection +//! reuse the existing `createDataFrameFromProto` path (the JVM side encodes a +//! `LogicalPlanNode` referencing the registered table) so no new scan JNI is +//! introduced here. +//! +//! - `registerRerunTableNative`: decode a [`RerunTableOptions`] envelope, +//! construct a [`DataframeQueryTableProvider`] (does schema discovery + sets +//! up the gRPC connection), and register it on the embedded +//! [`SessionContext`] under the given name. +//! - `listRerunSegmentsNative`: enumerate segment ids for the dataset, used by +//! the Spark driver to plan one input partition per segment. + +use std::sync::{Arc, Once}; + +use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, TimeUnit}; +use datafusion::catalog::TableProvider; +use datafusion::prelude::SessionContext; +use jni::objects::{JByteArray, JClass, JString}; +use jni::sys::{jlong, jobjectArray}; +use jni::JNIEnv; +use prost::Message; + +use re_datafusion::DataframeQueryTableProvider; +use re_dataframe::QueryExpression; +use re_log_types::EntryId; +use re_protos::cloud::v1alpha1::EntryFilter; +use re_redap_client::{ConnectionClient, ConnectionRegistry, ConnectionRegistryHandle, Credentials}; +use re_types_core::TimelineName; + +use crate::errors::{try_unwrap_or_throw, JniResult}; +use crate::proto_gen::RerunTableOptions; +use crate::runtime; + +/// Idempotent install of rustls's `ring` crypto provider. The rerun TLS stack +/// crashes at runtime if no default provider is installed; this used to be +/// done implicitly by `object_store` but rerun no longer pulls that in, so the +/// JNI bridge installs it explicitly on first use. 
+fn init_rustls_crypto() { + static ONCE: Once = Once::new(); + ONCE.call_once(|| { + // `install_default` returns Err when a provider has already been + // installed by another path; we don't care which one wins. + let _ = rustls::crypto::ring::default_provider().install_default(); + }); +} + +fn build_registry_handle( + origin: &re_uri::Origin, + token: &str, +) -> JniResult { + let handle = ConnectionRegistry::new_with_stored_credentials(); + let credentials = if token.is_empty() { + Credentials::Stored + } else { + let jwt = re_auth::Jwt::try_from(token.to_owned())?; + Credentials::Token(jwt) + }; + handle.set_credentials(origin, credentials); + Ok(handle) +} + +async fn resolve_entry_id( + handle: &ConnectionRegistryHandle, + origin: &re_uri::Origin, + options: &RerunTableOptions, +) -> JniResult { + if !options.dataset_id.is_empty() { + let id: EntryId = options + .dataset_id + .parse() + .map_err(|e: std::num::ParseIntError| -> Box { + format!("invalid Rerun dataset_id {:?}: {}", options.dataset_id, e).into() + })?; + return Ok(id); + } + if options.dataset_name.is_empty() { + return Err("RerunTableOptions: one of `dataset_id` or `dataset_name` must be set".into()); + } + let mut client = handle.client(origin.clone()).await?; + let entries = client + .find_entries(EntryFilter::new().with_name(options.dataset_name.clone())) + .await?; + let entry = entries.into_iter().next().ok_or_else( + || -> Box { + format!("no Rerun entry found with name {:?}", options.dataset_name).into() + }, + )?; + Ok(entry.id) +} + +fn build_query_expression(options: &RerunTableOptions) -> QueryExpression { + let mut qe = QueryExpression::default(); + if !options.index.is_empty() { + qe.filtered_index = Some(TimelineName::new(options.index.as_str())); + } + qe +} + +async fn build_provider( + options: RerunTableOptions, +) -> JniResult> { + init_rustls_crypto(); + let origin: re_uri::Origin = options + .url + .parse() + .map_err(|e: re_uri::Error| -> Box { + format!("invalid Rerun 
url {:?}: {}", options.url, e).into() + })?; + let handle = build_registry_handle(&origin, options.token.as_str())?; + let entry_id = resolve_entry_id(&handle, &origin, &options).await?; + let query_expr = build_query_expression(&options); + + let provider = DataframeQueryTableProvider::::new( + origin, + handle, + entry_id, + &query_expr, + options.segments.as_slice(), + None, + None, + None, + Vec::new(), + ) + .await?; + Ok(Arc::new(provider)) +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_SessionContext_registerRerunTableNative<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + name: JString<'local>, + options_proto: JByteArray<'local>, +) { + try_unwrap_or_throw(&mut env, (), |env| -> JniResult<()> { + if handle == 0 { + return Err("SessionContext handle is null".into()); + } + // SAFETY: matches the existing `registerTableNative` pattern — handle + // came from `createSessionContext` as `Box<SessionContext>` raw ptr. + let ctx = unsafe { &*(handle as *const SessionContext) }; + let name: String = env.get_string(&name)?.into(); + let bytes: Vec = env.convert_byte_array(&options_proto)?; + let options = RerunTableOptions::decode(bytes.as_slice())?; + + let provider = runtime().block_on(build_provider(options))?; + runtime().block_on(register_with_widening_view(ctx, name.as_str(), provider))?; + Ok(()) + }) +} + +/// Recursively compute the arrow_cast destination-type string for a column +/// whose Arrow type is not directly readable by Spark's `ArrowColumnVector`. +/// +/// Spark 3.5's `ArrowColumnVector` has no accessor for unsigned ints, Time, +/// or Float16. We use DataFusion's built-in `arrow_cast(col, '<type>')` +/// (rather than SQL `CAST`) because it preserves nested structure — a +/// `List<UInt16>` becomes `List(Int32)` end-to-end, not just at the top +/// level. Returns `Some(target)` if any widening is needed, `None` if the +/// column passes through unchanged. 
+/// +/// Coverage: +/// - scalars: UInt8/16/32/64, Float16, Time32/64 +/// - List<...>, LargeList<...>, FixedSizeList<..., size> with a widenable +/// element type (handles the `item` field rejection at any nesting depth) +/// +/// NOT covered in v1: Struct<...> / Map<...> with widenable children. The +/// JVM schema converter still rejects those with the original error. +fn arrow_cast_widening(dt: &DataType) -> Option { + match dt { + DataType::UInt8 => Some("Int16".into()), + DataType::UInt16 => Some("Int32".into()), + DataType::UInt32 => Some("Int64".into()), + // UInt64 widening is lossy for values ≥ 2^63 — documented limitation. + DataType::UInt64 => Some("Int64".into()), + DataType::Float16 => Some("Float32".into()), + DataType::Time32(_) => Some("Int32".into()), + DataType::Time64(_) => Some("Int64".into()), + // Spark's ArrowColumnVector accepts only Timestamp(Microsecond, ...). + // Other units cause `UNSUPPORTED_ARROWTYPE` at executor batch wrap. + // Cast all timestamps to microsecond precision, preserving the + // timezone string (None vs Some(tz)). + DataType::Timestamp(unit, tz) => { + if *unit == TimeUnit::Microsecond { + None + } else { + let tz_str = match tz { + None => "None".to_string(), + Some(s) => format!("Some(\"{}\")", s.replace('\\', "\\\\").replace('"', "\\\"")), + }; + Some(format!("Timestamp(Microsecond, {tz_str})")) + } + } + DataType::List(field) => { + arrow_cast_widening(field.data_type()).map(|t| format!("List({t})")) + } + DataType::LargeList(field) => { + arrow_cast_widening(field.data_type()).map(|t| format!("LargeList({t})")) + } + DataType::FixedSizeList(field, size) => arrow_cast_widening(field.data_type()) + .map(|t| format!("FixedSizeList({t}, {size})")), + _ => None, + } +} + +/// Register `provider` under `external_name`. 
If the provider's schema has any +/// fields that need a Spark-compatibility widen (see [`arrow_cast_widening`]), the +/// raw provider is stashed under a mangled name and `external_name` is +/// registered as a SQL view that casts the offending columns. v1 widens +/// top-level fields and list element types only — nested unsigned ints inside a Struct or Map still surface +/// the original Arrow type and will fail in the JVM schema converter. +async fn register_with_widening_view( + ctx: &SessionContext, + external_name: &str, + provider: Arc, +) -> JniResult<()> { + let schema: Arc = provider.schema(); + let needs_view = schema + .fields() + .iter() + .any(|f| arrow_cast_widening(f.data_type()).is_some()); + + if !needs_view { + ctx.register_table(external_name, provider)?; + return Ok(()); + } + + let raw_name = format!("__rerun_raw__{external_name}"); + ctx.register_table(raw_name.as_str(), provider)?; + + let select_list = schema + .fields() + .iter() + .map(|f| { + let name = f.name(); + // Identifier quoting: double quotes; escape any embedded "". + let quoted = format!("\"{}\"", name.replace('"', "\"\"")); + match arrow_cast_widening(f.data_type()) { + // arrow_cast preserves nested structure (List<UInt16> → + // List(Int32)); SQL CAST would have to be List-of-scalar + // only and produce a different operator graph. 
+ Some(target) => format!("arrow_cast({quoted}, '{target}') AS {quoted}"), + None => quoted, + } + }) + .collect::>() + .join(", "); + let sql = format!( + "SELECT {select_list} FROM \"{}\"", + raw_name.replace('"', "\"\"") + ); + + let df = ctx.sql(&sql).await?; + ctx.register_table(external_name, df.into_view())?; + Ok(()) +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_SessionContext_listRerunSegmentsNative<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + options_proto: JByteArray<'local>, +) -> jobjectArray { + try_unwrap_or_throw( + &mut env, + std::ptr::null_mut(), + |env| -> JniResult { + let bytes: Vec = env.convert_byte_array(&options_proto)?; + let options = RerunTableOptions::decode(bytes.as_slice())?; + + init_rustls_crypto(); + let origin: re_uri::Origin = options.url.parse().map_err( + |e: re_uri::Error| -> Box { + format!("invalid Rerun url {:?}: {}", options.url, e).into() + }, + )?; + let handle = build_registry_handle(&origin, options.token.as_str())?; + + let segments: Vec = runtime().block_on(async { + let entry_id = resolve_entry_id(&handle, &origin, &options).await?; + let mut client = handle.client(origin.clone()).await?; + let raw = client.get_dataset_segment_ids(entry_id).await?; + Ok::, Box>( + raw.into_iter().map(|s| s.into_inner()).collect(), + ) + })?; + + let string_class = env.find_class("java/lang/String")?; + let empty = env.new_string("")?; + let arr = env.new_object_array(segments.len() as i32, &string_class, &empty)?; + for (i, s) in segments.iter().enumerate() { + let js = env.new_string(s)?; + env.set_object_array_element(&arr, i as i32, js)?; + } + Ok(arr.into_raw()) + }, + ) +} diff --git a/proto/rerun_table_options.proto b/proto/rerun_table_options.proto new file mode 100644 index 0000000..c5e8ee6 --- /dev/null +++ b/proto/rerun_table_options.proto @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax = "proto3"; + +package datafusion_java; + +option java_package = "org.apache.datafusion.protobuf"; +option java_multiple_files = true; + +// Setup parameters for constructing a Rerun `TableProvider` and registering +// it into the embedded DataFusion `SessionContext`. These mirror the +// "Class 1" (author-set, not query-derived) parameters from the Spark +// connector design: every executor task receives the same envelope with +// `segments` narrowed to that task's single segment. +message RerunTableOptions { + // Rerun gRPC endpoint, e.g. "rerun+http://localhost:51234". + string url = 1; + + // EXACTLY ONE of `dataset_name` or `dataset_id` must be set. + // - `dataset_name`: resolved server-side via FindEntries(name=...). + // - `dataset_id`: parsed directly as a Tuid (skips the FindEntries RPC). + string dataset_name = 2; + string dataset_id = 3; + + // Optional explicit segment list. Empty → provider serves all segments + // (the driver-side `listRerunSegments` JNI is used by the Spark + // connector to enumerate before partitioning). + repeated string segments = 4; + + // Optional timeline name to set as `QueryExpression.filtered_index`. + // Empty → static-only query. + string index = 5; + + // Optional auth token (Bearer JWT). 
Empty → `Credentials::Stored` + // (looks up stored creds for this origin; falls back to REDAP_TOKEN). + string token = 6; +} From baa9ed33456934852a1911fd8401822cbe00b208 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 13:37:26 +0200 Subject: [PATCH 02/22] WIP on making table providers spark data sources with a working example --- .../apache/datafusion/RerunTableOptions.java | 166 - .../org/apache/datafusion/SessionContext.java | 77 +- examples/README.md | 58 + examples/SPARK_INTEGRATION.md | 168 + examples/native/Cargo.lock | 3653 +++++++++++++++++ examples/native/Cargo.toml | 27 + examples/native/src/lib.rs | 130 + .../examples/FfiTableProviderExample.java | 89 + .../FfiTableProviderExampleNative.java | 111 + native/Cargo.lock | 2404 ++--------- native/Cargo.toml | 15 +- native/build.rs | 1 - native/src/ffi_table_provider.rs | 71 + native/src/lib.rs | 2 +- native/src/rerun_provider.rs | 318 -- proto/rerun_table_options.proto | 52 - 16 files changed, 4595 insertions(+), 2747 deletions(-) delete mode 100644 core/src/main/java/org/apache/datafusion/RerunTableOptions.java create mode 100644 examples/README.md create mode 100644 examples/SPARK_INTEGRATION.md create mode 100644 examples/native/Cargo.lock create mode 100644 examples/native/Cargo.toml create mode 100644 examples/native/src/lib.rs create mode 100644 examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java create mode 100644 examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java create mode 100644 native/src/ffi_table_provider.rs delete mode 100644 native/src/rerun_provider.rs delete mode 100644 proto/rerun_table_options.proto diff --git a/core/src/main/java/org/apache/datafusion/RerunTableOptions.java b/core/src/main/java/org/apache/datafusion/RerunTableOptions.java deleted file mode 100644 index d2389a4..0000000 --- a/core/src/main/java/org/apache/datafusion/RerunTableOptions.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * 
Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datafusion; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Setup parameters for constructing a Rerun {@code TableProvider} and registering it on a {@link - * SessionContext}. Mirrors the Python construction chain {@code - * CatalogClient(url).get_dataset(name).filter_segments([...]).reader(index=...)}: every field here - * is "Class 1" (author-set, not query-derived) — pushdown filters and projection are negotiated - * separately by the Spark connector via DataFusion-proto {@code LogicalPlanNode}s submitted through - * {@link SessionContext#fromProto}. - * - *

EXACTLY ONE of {@link Builder#datasetName(String)} or {@link Builder#datasetId(String)} must - * be set. - */ -public final class RerunTableOptions { - - private final String url; - private final String datasetName; - private final String datasetId; - private final List segments; - private final String index; - private final String token; - - private RerunTableOptions(Builder b) { - if (b.url == null || b.url.isEmpty()) { - throw new IllegalArgumentException("RerunTableOptions.url must be non-empty"); - } - if ((b.datasetName == null || b.datasetName.isEmpty()) - == (b.datasetId == null || b.datasetId.isEmpty())) { - // Both unset or both set — either way is ambiguous. - throw new IllegalArgumentException( - "RerunTableOptions: exactly one of datasetName or datasetId must be set"); - } - this.url = b.url; - this.datasetName = b.datasetName == null ? "" : b.datasetName; - this.datasetId = b.datasetId == null ? "" : b.datasetId; - this.segments = - b.segments == null ? Collections.emptyList() : Collections.unmodifiableList(b.segments); - this.index = b.index == null ? "" : b.index; - this.token = b.token == null ? "" : b.token; - } - - public static Builder builder() { - return new Builder(); - } - - public String url() { - return url; - } - - public String datasetName() { - return datasetName; - } - - public String datasetId() { - return datasetId; - } - - public List segments() { - return segments; - } - - public String index() { - return index; - } - - /** - * Return a copy with {@code segments} replaced — used by the Spark connector to narrow to one - * segment per executor task. - */ - public RerunTableOptions withSegments(List segments) { - Builder b = new Builder(); - b.url = this.url; - b.datasetName = this.datasetName.isEmpty() ? null : this.datasetName; - b.datasetId = this.datasetId.isEmpty() ? null : this.datasetId; - b.segments = segments == null ? null : new ArrayList<>(segments); - b.index = this.index.isEmpty() ? 
null : this.index; - b.token = this.token.isEmpty() ? null : this.token; - return new RerunTableOptions(b); - } - - /** - * Serialize as the {@code RerunTableOptions} protobuf consumed by the JNI bridge. Public so the - * Spark connector can ship the bytes through Java serialization to executors (executors - * deserialize them on their own {@link SessionContext} via {@link - * SessionContext#registerRerunTable}). - */ - public byte[] toProtoBytes() { - org.apache.datafusion.protobuf.RerunTableOptions.Builder b = - org.apache.datafusion.protobuf.RerunTableOptions.newBuilder() - .setUrl(url) - .setDatasetName(datasetName) - .setDatasetId(datasetId) - .setIndex(index) - .setToken(token); - b.addAllSegments(segments); - return b.build().toByteArray(); - } - - public static final class Builder { - private String url; - private String datasetName; - private String datasetId; - private List segments; - private String index; - private String token; - - public Builder url(String url) { - this.url = url; - return this; - } - - public Builder datasetName(String name) { - this.datasetName = name; - return this; - } - - public Builder datasetId(String id) { - this.datasetId = id; - return this; - } - - public Builder segments(List segments) { - this.segments = segments == null ? null : new ArrayList<>(segments); - return this; - } - - /** Timeline name to set as the query's {@code filtered_index}. */ - public Builder index(String index) { - this.index = index; - return this; - } - - /** Bearer JWT token. If unset, falls back to stored credentials / {@code REDAP_TOKEN} env. 
*/ - public Builder token(String token) { - this.token = token; - return this; - } - - public RerunTableOptions build() { - return new RerunTableOptions(this); - } - } -} diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java index 9006a26..86140a0 100644 --- a/core/src/main/java/org/apache/datafusion/SessionContext.java +++ b/core/src/main/java/org/apache/datafusion/SessionContext.java @@ -571,72 +571,33 @@ public void registerUdf(ScalarUdf udf) { * @throws RuntimeException if native registration fails. */ /** - * Register a Rerun dataset as a {@link org.apache.datafusion.protobuf.RerunTableOptions table} - * named {@code name}. Schema discovery and gRPC connection setup happen on the calling thread - * (synchronously). Subsequent {@link #sql} / {@link #fromProto} queries that reference {@code - * name} drive the embedded Rerun {@code TableProvider} directly — no extra JNI calls per scan. + * Register a TableProvider produced as an {@code FFI_TableProvider} pointer by Rust code on the + * far side of the FFI boundary. * - *

Push-down for filter and projection is negotiated by sending a {@code LogicalPlanNode} - * through {@link #fromProto}: the JVM side encodes {@code TableScan(name) + Filter + Projection} - * referencing this registered table, and the Rust executor executes it natively. + *

The pointer is the raw boxed address ({@code Box::into_raw(Box::new(FFI_TableProvider))}) + * returned by another cdylib's JNI entry point — typically a domain bridge (e.g. Rerun's + * createFfiProvider) followed by the connector-core widening helper. Ownership transfers in; the + * pointer must not be reused after this call. * - * @throws IllegalStateException if this context is closed. - * @throws RuntimeException if native registration fails (network, auth, schema, etc.). - */ - /** - * Raw-bytes overload of {@link #registerRerunTable(String, RerunTableOptions)}. Used by the Spark - * connector on the executor side: the driver serializes a {@link RerunTableOptions} once via - * {@link RerunTableOptions#toProtoBytes()} and ships the bytes through Java serialization, - * skipping a POJO round-trip. - * - *

{@code optionsProtoBytes} must be a serialized {@code datafusion_java.RerunTableOptions} - * proto (see {@code rerun_table_options.proto}). + *

Predicate pushdown and projection cross the FFI boundary as part of the standard + * datafusion-ffi protocol; no JVM-side TableProvider implementation runs. * * @throws IllegalStateException if this context is closed. - * @throws RuntimeException if native registration fails (network, auth, schema, etc.). + * @throws IllegalArgumentException if {@code name} is empty or {@code ffiTableProviderPtr} is 0. + * @throws RuntimeException if native registration fails. */ - public void registerRerunTable(String name, byte[] optionsProtoBytes) { + public void registerFfiTable(String name, long ffiTableProviderPtr) { if (nativeHandle == 0) { throw new IllegalStateException("SessionContext is closed"); } if (name == null || name.isEmpty()) { - throw new IllegalArgumentException("registerRerunTable name must be non-empty"); + throw new IllegalArgumentException("registerFfiTable name must be non-empty"); } - if (optionsProtoBytes == null) { - throw new IllegalArgumentException("registerRerunTable optionsProtoBytes must be non-null"); + if (ffiTableProviderPtr == 0) { + throw new IllegalArgumentException( + "registerFfiTable ffiTableProviderPtr must be a non-null FFI_TableProvider pointer"); } - registerRerunTableNative(nativeHandle, name, optionsProtoBytes); - } - - public void registerRerunTable(String name, RerunTableOptions options) { - if (nativeHandle == 0) { - throw new IllegalStateException("SessionContext is closed"); - } - if (name == null || name.isEmpty()) { - throw new IllegalArgumentException("registerRerunTable name must be non-empty"); - } - if (options == null) { - throw new IllegalArgumentException("registerRerunTable options must be non-null"); - } - registerRerunTableNative(nativeHandle, name, options.toProtoBytes()); - } - - /** - * Enumerate segment ids for the dataset described by {@code options}. 
Spark uses this on the - * driver to plan one input partition per segment; on the executor, the connector re-builds the - * same options with {@link RerunTableOptions#withSegments} narrowed to a single segment and calls - * {@link #registerRerunTable}. - * - *

This is logically a static operation — it makes its own gRPC connection and does not need a - * {@link SessionContext}. Exposed as a static method on this class so all native entry points - * live together. - */ - public static String[] listRerunSegments(RerunTableOptions options) { - if (options == null) { - throw new IllegalArgumentException("listRerunSegments options must be non-null"); - } - String[] result = listRerunSegmentsNative(options.toProtoBytes()); - return result == null ? new String[0] : result; + registerFfiTableNative(nativeHandle, name, ffiTableProviderPtr); } public void registerTable(String name, TableProvider provider) { @@ -734,8 +695,6 @@ private static native void registerScalarUdf( private static native void registerTableNative( long handle, String name, byte[] schemaIpcBytes, TableProvider provider); - private static native void registerRerunTableNative( - long handle, String name, byte[] optionsProtoBytes); - - private static native String[] listRerunSegmentsNative(byte[] optionsProtoBytes); + private static native void registerFfiTableNative( + long handle, String name, long ffiTableProviderPtr); } diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..beb7763 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,58 @@ +# DataFusion-Java examples + +Self-contained Java programs against the DataFusion-Java API. + +`exec:exec` (not `exec:java`) runs each one — the pom shells out to a fresh +`java` process so the JNI library's `--add-opens=java.base/java.nio=ALL-UNNAMED` +JVM flag actually applies. + +`exec:exec` is a separate Maven invocation from the one that built the +project, so it resolves `datafusion-java` from your local Maven repository +rather than the reactor's `target/` dirs. 
That means the parent must be +**installed** to the local repo first — `package -am` builds the jar but +does NOT publish it, which surfaces as +`Could not find artifact org.apache.datafusion:datafusion-java:jar:0.2.0-SNAPSHOT`. + +```bash +# Install the fork into your local Maven repo, then run any example. +mvn -B install -DskipTests -Drat.skip=true \ + -Ddatafusion.native.profile=release +mvn -B -pl examples exec:exec \ + -Dexec.mainClass=org.apache.datafusion.examples. +``` + +(If your local Maven repo lives somewhere other than `~/.m2/repository`, +add `-Dmaven.repo.local=/path/to/repo` to BOTH invocations.) + +| Class | What it shows | +| -------------------------------- | --------------------------------------------------------------------------------------------- | +| `SqlQueryExample` | Register a CSV file and run a SQL aggregation. | +| `DataFrameExample` | DataFrame API: filter, group, sort. | +| `ProtoPlanExample` | Build a `LogicalPlanNode` proto in Java, hand it to `SessionContext.fromProto`. | +| `JdbcExample` | Pull from an H2 JDBC source into Arrow, register it, query. | +| `AddOneExample` | Implement a Scalar UDF in Java and register it on the session. | +| `NestedTypeUdfExample` | Scalar UDF over `List` — input + output nested arrow types. | +| `FfiTableProviderExample` | Build an `FFI_TableProvider` in Rust (a `MemTable`), hand the raw pointer to the JVM, register it via `SessionContext.registerFfiTable`, run SQL. **See also: [SPARK_INTEGRATION.md](SPARK_INTEGRATION.md).** | + +## Building the FFI example's cdylib + +The `FfiTableProviderExample` relies on a small Rust cdylib under +[`native/`](native/) — built independently from the main `datafusion-jni` +crate: + +```bash +cd examples/native +cargo build --release +``` + +The example's `System.load` searches the following paths in order: + +1. `-Dexample.ffi.lib.path=/abs/path/to/lib...` (explicit override) +2. `examples/native/target/release/` (Maven's cwd is the repo root) +3. 
`examples/native/target/debug/` +4. `native/target/release/` (cwd inside the `examples` module) +5. `native/target/debug/` + +The library file name is `libdatafusion_java_ffi_example.so` on Linux, +`libdatafusion_java_ffi_example.dylib` on macOS, or +`datafusion_java_ffi_example.dll` on Windows. diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md new file mode 100644 index 0000000..690a91a --- /dev/null +++ b/examples/SPARK_INTEGRATION.md @@ -0,0 +1,168 @@ +# Using an FFI TableProvider as a Spark Data Source + +The [`FfiTableProviderExample`](src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java) +shows the JVM side of the FFI handover: Rust builds an `FFI_TableProvider`, +hands the raw pointer to the JVM, and the JVM calls +`SessionContext.registerFfiTable(name, ptr)` to make it queryable through +DataFusion-Java. + +That same flow plugs into Apache Spark as a DataSource V2 by way of the +[`connector-core`](https://github.com/rerun-io/rerun-spark-connector) module +(generic Spark plumbing donated upstream-ready). Below is the recipe for +wiring a domain bridge — e.g. an in-house format or a custom catalog — into +Spark via this pattern. 
+ +## Architecture + +``` ++--------------------------+ +------------------------------+ +| Your bridge cdylib | byte[] opts | Your bridge JVM glue | +| - Rust JNI: | <----+ | - Java POJO + proto encoder | +| createFfiProvider | | | - FfiProviderFactory impl | +| listPartitions | | jlong | - System.load(cdylib) | +| - FFI_TableProvider | <----+----+----+-------- driver / executor | ++--------------------------+ raw ptr +------------------------------+ + | + v ++--------------------------+ +------------------------------+ +| connector-core cdylib | jlong (wide) | connector-core JVM | +| - WideningTableProvider | <------------- | - DatafusionSource (DSv2) | +| over arrow::cast | | - SparkPredicateTranslator | ++--------------------------+ | - ColumnarPartitionReader | + +------------------------------+ + | + v + +------------------------------+ + | datafusion-java | + | - SessionContext | + | - registerFfiTable(name,ptr)| + | - DataFrame.filterFromProto | + +------------------------------+ +``` + +Key invariants: + +- Only the opaque `FFI_TableProvider` pointer crosses the cdylib boundary. + No `SessionContext` is ever shared. +- The widening cdylib (connector-core) sits between your bridge and + `registerFfiTable`. It casts Spark-incompatible Arrow types (UInt*, Float16, + Time*, non-µs Timestamp, recursive List/LargeList/FixedSizeList) using + kernel-level `arrow::compute::cast`. No SQL, no view rewrites. +- Predicate pushdown crosses the FFI boundary as a `LogicalExprNode` proto + (datafusion-proto). Spark translates V2 `Predicate`s and ships the bytes; + the producer's `TableProviderFilterPushDown::scan(...)` sees them as Rust + `Expr`s. + +## Producer side (Rust) + +Your bridge cdylib exposes a `createFfiProvider` JNI entrypoint that decodes +your domain proto, builds an `Arc`, and wraps it in +`FFI_TableProvider`. This is exactly what +[`examples/native/src/lib.rs`](native/src/lib.rs) does for a `MemTable`. 
For +a real bridge, replace the `MemTable` with your own `TableProvider` +implementation: + +```rust +let provider: Arc = runtime().block_on(build_provider(opts))?; +let ffi = FFI_TableProvider::new( + provider, + /*can_support_pushdown_filters=*/ true, + Some(runtime().clone()), + FFI_TaskContextProvider::from(&ctx_provider), // throwaway local SessionContext + /*logical_codec=*/ None, // default DataFusion codec +); +Box::into_raw(Box::new(ffi)) as jlong +``` + +Driver-side partition enumeration goes through a second JNI entrypoint +`listPartitions(options_proto_bytes) -> String[]`. One Spark task gets created +per returned id. + +## JVM glue + +Implement `io.datafusion.spark.FfiProviderFactory` (from +[`connector-core`](https://github.com/rerun-io/rerun-spark-connector/blob/main/connector-core/src/main/java/io/datafusion/spark/FfiProviderFactory.java)). +Must be no-arg constructable so executors can instantiate it via +`Class.forName(...).getDeclaredConstructor().newInstance()`. + +```java +public final class MyBridgeProviderFactory implements FfiProviderFactory { + + @Override + public byte[] encodeOptions(Map sparkOptions) { + // Translate Spark options ("url", "table", ...) into your proto. 
+ return MyBridgeOptions.fromMap(sparkOptions).toProtoBytes(); + } + + @Override + public String[] listPartitions(byte[] optionsProtoBytes) { + return MyBridgeNative.listPartitions(optionsProtoBytes); + } + + @Override + public long createProvider(byte[] optionsProtoBytes) { + return MyBridgeNative.createFfiProvider(optionsProtoBytes); + } +} +``` + +## Wiring it into Spark + +Two paths, pick one: + +### Option A — config option per use + +```python +df = (spark.read.format("datafusion") + .option("df.factory", "com.example.MyBridgeProviderFactory") + .option("url", "rerun+http://localhost:51234") + .option("table", "my_dataset") + .load()) +df.printSchema() +df.filter("ts > 1700000000").show() +``` + +### Option B — thin shim with a short name + +Mirror the +[`rerun-connector`](https://github.com/rerun-io/rerun-spark-connector/blob/main/rerun-connector/src/main/scala/io/rerun/spark/RerunDataSource.scala) +shim — a ~20-line subclass that bakes the factory FQCN in: + +```scala +class MyDataSource extends DatafusionSource { + override def shortName(): String = "my_format" + override protected def factoryFqcn(opts: CaseInsensitiveStringMap): String = + "com.example.MyBridgeProviderFactory" +} +``` + +Register via `META-INF/services/org.apache.spark.sql.sources.DataSourceRegister`, +then: + +```python +df = (spark.read.format("my_format") + .option("url", "...") + .option("table", "...") + .load()) +``` + +## What runs where + +| Phase | Where | Path | +| --------------------------- | --------- | ---- | +| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider` → widen → `registerFfiTable` → `ctx.tableSchema` | +| `planInputPartitions` | Driver | `factory.listPartitions(optionsBytes)` → one task per id | +| Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | +| Per-task scan | Executor | Same factory → widen → `registerFfiTable` → `ctx.sql("SELECT 
proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | + +## Caveats + +- One `FFI_LogicalExtensionCodec` per provider — v1 uses + `DefaultLogicalExtensionCodec`. If your bridge serializes custom + `LogicalNode`s, swap the codec at `FFI_TableProvider::new` time. +- Each cdylib brings its own Tokio runtime and (for TLS-using bridges) its + own rustls install. Both should be `Once`-gated. +- The widening cdylib in `connector-core` covers top-level scalars + List + children. Nested unsigned inside `Struct`/`Map` still surfaces the raw + Arrow type to Spark and fails at column-vector accessor time. Extend + `arrow_cast_widening` if you hit this. diff --git a/examples/native/Cargo.lock b/examples/native/Cargo.lock new file mode 100644 index 0000000..0f5ee7a --- /dev/null +++ b/examples/native/Cargo.lock @@ -0,0 +1,3653 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ar_archive_writer" 
+version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" +dependencies = [ + "object", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.17.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" +dependencies = [ + "bytes", + "half", + "num-bigint", + 
"num-traits", +] + +[[package]] +name = "arrow-cast" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-ipc" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "chrono", + "half", + "indexmap", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "58.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" +dependencies = [ + "bitflags", + "serde_core", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-compression" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac" +dependencies = [ + "compression-codecs", + "compression-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies 
= [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures 0.3.0", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "8.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "cc" +version = "1.2.63" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chrono" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width", +] + +[[package]] +name = "compression-codecs" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = 
"compression-core" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "const_panic" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" +dependencies = [ + "typewit", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "core_extensions" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" 
+version = "6.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" +dependencies = [ + "arrow", + "arrow-schema", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools", + "liblzma", + "log", + "object_store", + "parking_lot", + "parquet", + "rand", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + 
"datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools", + "log", + "object_store", +] + +[[package]] +name = "datafusion-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + 
"futures", + "glob", + "itertools", + "liblzma", + "log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "zstd", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", + "tokio-stream", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "parquet", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" + +[[package]] +name = "datafusion-execution" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" +dependencies = [ + "arrow", + "arrow-buffer", + "async-trait", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr-common", + "futures", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap", + "itertools", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap", + "itertools", + "paste", +] + +[[package]] +name = "datafusion-ffi" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" +dependencies = [ + "abi_stable", + "arrow", + "arrow-schema", + "async-ffi", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto", + "datafusion-proto-common", + "datafusion-session", + "futures", + "log", + "prost", + "semver", + "tokio", +] + +[[package]] +name = "datafusion-functions" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools", + "log", + "md-5", + "memchr", + "num-traits", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "num-traits", + "paste", +] + 
+[[package]] +name = "datafusion-functions-aggregate-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "hashbrown 0.16.1", + "itertools", + "itoa", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-java-ffi-example" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "datafusion-ffi", + "jni", + "tokio", +] + +[[package]] +name = "datafusion-macros" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-optimizer" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap", + "itertools", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", + "paste", + "petgraph", + "recursive", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools", +] + +[[package]] +name = 
"datafusion-physical-expr-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "log", + "num-traits", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + 
"datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost", + "rand", +] + +[[package]] +name = "datafusion-proto-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + +[[package]] +name = "datafusion-pruning" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools", + "log", +] + +[[package]] +name = "datafusion-session" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", +] + +[[package]] +name = "datafusion-sql" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" +dependencies = [ + "arrow", + "bigdecimal", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-functions-nested", + "indexmap", + "log", + "recursive", + "regex", + "sqlparser", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = 
"0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = 
"icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys 0.3.1", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn 2.0.117", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +dependencies = [ + "cfg-if", + "futures-util", + "wasm-bindgen", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libbz2-rs-sys" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "liblzma" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] 
+name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" + +[[package]] +name = "lz4_flex" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef0d4ed8669f8f8826eb00dc878084aa8f253506c4fd5e8f58f5bce72ddb97e" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" 
+dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures-channel", + "futures-core", + "futures-util", + "http", + "humantime", + "itertools", + "parking_lot", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = 
"parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "parquet" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.17.1", + "lz4_flex", + "num-bigint", + "num-integer", + "num-traits", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + 
"quote", + "syn 2.0.117", +] + +[[package]] +name = "psm" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.117", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustversion" +version = 
"1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "serde_json" +version = 
"1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "sqlparser" +version = "0.61.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stacker" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.61.2", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tempfile" +version = 
"3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", + "tokio-util", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "tstr" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" +dependencies = [ + "tstr_proc_macros", +] + +[[package]] +name = "tstr_proc_macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "typewit" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" 
+version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "uuid" +version = "1.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn 2.0.117", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" 
+dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + 
"windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name 
= "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "yoke" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" 
+dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/examples/native/Cargo.toml b/examples/native/Cargo.toml new file mode 100644 index 0000000..d10298b --- /dev/null +++ b/examples/native/Cargo.toml @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +[package] +name = "datafusion-java-ffi-example" +version = "0.1.0" +edition = "2021" +publish = false + +[lib] +# Built as a cdylib so the JVM-side example can System.load() the artifact. +# `rlib` lets us add Rust-level unit tests if needed. +crate-type = ["cdylib", "rlib"] + +[dependencies] +arrow = { version = "58", features = ["ffi"] } +datafusion = { version = "53.1.0" } +datafusion-ffi = "53.1.0" +jni = "0.21" +tokio = { version = "1", features = ["rt-multi-thread"] } diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs new file mode 100644 index 0000000..9d6406f --- /dev/null +++ b/examples/native/src/lib.rs @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Example cdylib that produces a small DataFusion `MemTable` wrapped as an +//! `FFI_TableProvider`, returned to the JVM as a `jlong` (the raw boxed +//! pointer). The JVM example uses `SessionContext.registerFfiTable(name, ptr)` +//! 
to install the provider on a DataFusion session and runs SQL against it. +//! +//! The same pattern is what domain bridges (Rerun, HDF5, custom Iceberg) use +//! to expose their TableProviders to DataFusion-Java — and, transitively, to +//! Spark via the connector-core DataSource V2 plumbing. + +use std::sync::Arc; + +use arrow::array::{Float64Array, Int64Array, RecordBatch, StringArray}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use datafusion::catalog::TableProvider; +use datafusion::datasource::MemTable; +use datafusion::execution::TaskContextProvider; +use datafusion::prelude::SessionContext; +use datafusion_ffi::execution::FFI_TaskContextProvider; +use datafusion_ffi::table_provider::FFI_TableProvider; +use jni::objects::JClass; +use jni::sys::jlong; +use jni::JNIEnv; +use tokio::runtime::{Handle, Runtime}; + +/// Tokio runtime that the FFI provider is anchored to. Shared across calls +/// for the lifetime of the cdylib so successive `createMemTableProvider` +/// invocations don't spawn fresh runtimes. +fn runtime() -> &'static Handle { + use std::sync::OnceLock; + static RT: OnceLock = OnceLock::new(); + RT.get_or_init(|| Runtime::new().expect("tokio runtime init failed")) + .handle() +} + +/// Throwaway `SessionContext` used only to obtain a `TaskContextProvider` +/// for `FFI_TableProvider::new`. The example does not register anything on it. +fn host_session_context() -> &'static Arc { + use std::sync::OnceLock; + static CTX: OnceLock> = OnceLock::new(); + CTX.get_or_init(|| Arc::new(SessionContext::new())) +} + +/// Build the example schema + a single-batch in-memory table. 
+fn build_mem_table() -> Result, Box> { + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, true), + Field::new("value", DataType::Float64, true), + ])); + + let ids = Int64Array::from(vec![1, 2, 3, 4]); + let names = StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dave")]); + let values = Float64Array::from(vec![Some(1.5), Some(2.5), Some(3.5), None]); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(ids), Arc::new(names), Arc::new(values)], + )?; + + Ok(Arc::new(MemTable::try_new(schema, vec![vec![batch]])?)) +} + +/// JNI entry point: build a small `MemTable`, wrap it in an `FFI_TableProvider`, +/// return the raw boxed pointer as a `jlong`. Ownership of the boxed FFI +/// transfers to the caller — the matching `Box::from_raw` is performed by +/// `SessionContext.registerFfiTable` on the consumer side. +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, +) -> jlong { + let result: Result> = (|| { + let mem_table = build_mem_table()?; + let provider: Arc = mem_table; + + let ctx_provider: Arc = + Arc::clone(host_session_context()) as Arc; + let ffi_task_ctx = FFI_TaskContextProvider::from(&ctx_provider); + let ffi = FFI_TableProvider::new( + provider, + /*can_support_pushdown_filters=*/ true, + Some(runtime().clone()), + ffi_task_ctx, + /*logical_codec=*/ None, + ); + Ok(Box::into_raw(Box::new(ffi)) as jlong) + })(); + + match result { + Ok(ptr) => ptr, + Err(err) => { + let _ = env.throw_new("java/lang/RuntimeException", err.to_string()); + 0 + } + } +} + +/// Drop a previously-created FFI_TableProvider whose pointer was NOT handed +/// off to `registerFfiTable`. Exposed for symmetry — callers that pass the +/// pointer to `registerFfiTable` must NOT also call this; ownership has +/// already transferred. 
+#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_dropProvider<'local>( + _env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_ptr: jlong, +) { + if ffi_ptr != 0 { + unsafe { + drop(Box::from_raw(ffi_ptr as *mut FFI_TableProvider)); + } + } +} diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java new file mode 100644 index 0000000..dcb8441 --- /dev/null +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion.examples; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.datafusion.DataFrame; +import org.apache.datafusion.SessionContext; + +/** + * Register a Rust-built {@code FFI_TableProvider} on a {@link SessionContext} and run SQL against + * it. + * + *

The provider here wraps a tiny in-memory table (4 rows, 3 columns) built by the example cdylib + * under {@code examples/native}. The same {@link SessionContext#registerFfiTable(String, long)} + * entry point is what domain bridges (Rerun, HDF5, custom Iceberg) use to expose their native + * {@code TableProvider}s — and, transitively, what the Spark connector uses through the {@code + * FfiProviderFactory} interface in {@code connector-core} (see {@code + * examples/SPARK_INTEGRATION.md}). + * + *

How to run (from the fork repo root): + * + *

{@code
+ * (cd examples/native && cargo build --release)
+ * mvn -B install -DskipTests -Drat.skip=true \
+ *     -Ddatafusion.native.profile=release
+ * mvn -B -pl examples exec:exec \
+ *     -Dexec.mainClass=org.apache.datafusion.examples.FfiTableProviderExample
+ * }
+ * + *

The first {@code mvn install} step publishes {@code datafusion-java} to your local Maven repo + * so the separate {@code exec:exec} invocation can resolve it as a dependency. Skipping straight to + * {@code exec:exec} after a {@code package} build fails with {@code Could not find artifact + * org.apache.datafusion:datafusion-java:...}. + */ +public final class FfiTableProviderExample { + + private FfiTableProviderExample() {} + + public static void main(String[] args) throws Exception { + // Build the FFI provider on the Rust side. The returned `long` is a + // `Box::into_raw(Box::new(FFI_TableProvider))` pointer; ownership flows + // through `registerFfiTable` into the SessionContext. + long ffiProviderPtr = FfiTableProviderExampleNative.createMemTableProvider(); + if (ffiProviderPtr == 0) { + throw new IllegalStateException("Native FFI provider builder returned 0"); + } + + try (var allocator = new RootAllocator(); + var ctx = new SessionContext()) { + + // Hand the raw pointer to DataFusion. After this call, the SessionContext + // owns the boxed FFI_TableProvider; do NOT call dropProvider afterwards. + ctx.registerFfiTable("example_mem", ffiProviderPtr); + + // Filter pushdown crosses the FFI boundary transparently — DataFusion's + // optimizer rewrites the predicate into a TableProviderFilterPushDown + // call on the foreign provider, which a MemTable handles unsupported + // (the executor re-applies it above the scan). 
+ try (DataFrame df = + ctx.sql("SELECT id, name, value FROM example_mem WHERE id > 1 ORDER BY id"); + ArrowReader reader = df.collect(allocator)) { + System.out.println("Result rows:"); + while (reader.loadNextBatch()) { + VectorSchemaRoot batch = reader.getVectorSchemaRoot(); + System.out.print(batch.contentToTSVString()); + } + } + } + } +} diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java new file mode 100644 index 0000000..3ca2784 --- /dev/null +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion.examples; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Locale; + +/** + * JNI bindings into the example cdylib at {@code examples/native}. 
The cdylib produces a small + * {@code MemTable}-backed {@code FFI_TableProvider} that the JVM example registers on a {@link + * org.apache.datafusion.SessionContext} via {@link + * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}. + * + *

The library is located in this order: + * + *

    + *
  1. Absolute path passed via {@code -Dexample.ffi.lib.path=/abs/path/to/lib...}. + *
  2. {@code examples/native/target/release/} relative to the current working + * directory (the default when invoked via {@code mvn exec:java} from the repo root). + *
  3. {@code examples/native/target/debug/} as a fallback for {@code cargo build} + * without {@code --release}. + *
+ * + *

If none of these exist, an {@link UnsatisfiedLinkError} surfaces with the search list so the + * user knows what to build. + */ +final class FfiTableProviderExampleNative { + + private static final String LIBRARY_NAME = "datafusion_java_ffi_example"; + + private FfiTableProviderExampleNative() {} + + static { + loadLibrary(); + } + + /** + * Build a tiny {@code MemTable} on the Rust side, wrap it in an {@code FFI_TableProvider}, and + * return the raw boxed pointer as a {@code long}. Ownership transfers to the caller; passing the + * pointer to {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)} + * discharges it. + */ + static native long createMemTableProvider(); + + /** + * Drop an FFI_TableProvider pointer that was NEVER handed to {@code + * SessionContext.registerFfiTable}. Call this only on the error path before registration; once + * {@code registerFfiTable} accepts the pointer it owns the box. + */ + static native void dropProvider(long ffiTableProviderPtr); + + private static void loadLibrary() { + String mapped = System.mapLibraryName(LIBRARY_NAME); + Path explicit = optionalPath(System.getProperty("example.ffi.lib.path")); + + // Cover both common cwds: repo root (mvn exec from datafusion-java/) and + // the examples module (mvn exec from datafusion-java/examples/). + Path[] candidates = + new Path[] { + explicit, + Paths.get("examples", "native", "target", "release", mapped), + Paths.get("examples", "native", "target", "debug", mapped), + Paths.get("native", "target", "release", mapped), + Paths.get("native", "target", "debug", mapped), + }; + + for (Path candidate : candidates) { + if (candidate != null && Files.exists(candidate)) { + System.load(candidate.toAbsolutePath().toString()); + return; + } + } + + StringBuilder searched = new StringBuilder(); + for (Path c : candidates) { + if (searched.length() > 0) searched.append(", "); + searched.append(c == null ? 
"null" : c.toAbsolutePath().toString()); + } + throw new UnsatisfiedLinkError( + String.format( + Locale.ROOT, + "Example native library %s not found. Searched: [%s]. " + + "Build with 'cd examples/native && cargo build --release', or pass " + + "-Dexample.ffi.lib.path=.", + mapped, + searched)); + } + + private static Path optionalPath(String s) { + return (s == null || s.isEmpty()) ? null : Paths.get(s); + } +} diff --git a/native/Cargo.lock b/native/Cargo.lock index 78ce6b9..93b2d0e 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -2,6 +2,54 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + [[package]] name = "adler2" version = "2.0.1" @@ -61,56 +109,6 @@ dependencies = [ "libc", ] -[[package]] -name = "anstream" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" - -[[package]] -name = "anstyle-parse" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" -dependencies = [ - "windows-sys 0.61.2", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" -dependencies = [ - "anstyle", - "once_cell_polyfill", - "windows-sys 0.61.2", -] - [[package]] name = "anyhow" version = "1.0.102" @@ -139,8 +137,8 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum 0.27.2", - "strum_macros 0.27.2", + "strum", + "strum_macros", "thiserror 2.0.18", "uuid", "zstd", @@ -155,12 +153,6 @@ dependencies = [ "object", ] -[[package]] -name = "array-init" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc" - [[package]] name = "arrayref" version = "0.3.9" @@ -399,21 +391,15 @@ dependencies = [ ] [[package]] -name = "ascii" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" - -[[package]] -name = "async-channel" -version 
= "2.5.0" +name = "as_derive_utils" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" dependencies = [ - "concurrent-queue", - "event-listener-strategy", - "futures-core", - "pin-project-lite", + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] @@ -429,36 +415,23 @@ dependencies = [ ] [[package]] -name = "async-recursion" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-stream" -version = "0.3.6" +name = "async-ffi" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", + "abi_stable", ] [[package]] -name = "async-stream-impl" -version = "0.3.6" +name = "async-recursion" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -469,7 +442,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -493,86 +466,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "aws-lc-rs" -version = 
"1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ec2f1fc3ec205783a5da9a7e6c1509cc69dedf09a1949e412c1e18469326d00" -dependencies = [ - "aws-lc-sys", - "zeroize", -] - -[[package]] -name = "aws-lc-sys" -version = "0.41.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a2f9779ce85b93ab6170dd940ad0169b5766ff848247aff13bb788b832fe3f4" -dependencies = [ - "cc", - "cmake", - "dunce", - "fs_extra", -] - -[[package]] -name = "axum" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" -dependencies = [ - "axum-core", - "bytes", - "form_urlencoded", - "futures-util", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde_core", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "axum-core" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "sync_wrapper", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "az" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be5eb007b7cacc6c660343e96f650fedf4b5a77512399eb952ca6642cf8d13f7" - [[package]] name = "base64" version = "0.22.1" @@ -598,9 +491,6 @@ name = "bitflags" version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" -dependencies = [ - "bytemuck", -] [[package]] name = "blake2" @@ -656,7 +546,7 @@ 
dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.117", ] [[package]] @@ -686,26 +576,6 @@ version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" -[[package]] -name = "bytemuck" -version = "1.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "byteorder" version = "1.5.0" @@ -727,12 +597,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "camino" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" - [[package]] name = "cc" version = "1.2.62" @@ -751,17 +615,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" -[[package]] -name = "cfb" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" -dependencies = [ - "byteorder", - "fnv", - "uuid", -] - [[package]] name = "cfg-if" version = "1.0.4" @@ -792,10 +645,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", - "js-sys", "num-traits", "serde", - "wasm-bindgen", "windows-link", ] @@ -806,61 +657,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ "chrono", - "phf 0.12.1", -] - -[[package]] -name = "chunked_transfer" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901" - -[[package]] -name = "clap" -version = "4.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" -dependencies = [ - "clap_builder", - "clap_derive", -] - -[[package]] -name = "clap_builder" -version = "4.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" -dependencies = [ - "anstream", - "anstyle", - "clap_lex", - "strsim", -] - -[[package]] -name = "clap_derive" -version = "4.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn", + "phf", ] -[[package]] -name = "clap_lex" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" - -[[package]] -name = "clean-path" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aaa6b4b263a5d737e9bf6b7c09b72c41a5480aec4d7219af827f6564e950b6a5" - [[package]] name = "cmake" version = "0.1.58" @@ -870,12 +669,6 @@ dependencies = [ "cc", ] -[[package]] -name = "colorchoice" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" - [[package]] name = "combine" version = "4.6.7" @@ -892,7 +685,6 @@ version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ - "crossterm", "unicode-segmentation", "unicode-width", ] @@ -918,15 +710,6 @@ version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" -[[package]] -name = "concurrent-queue" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "const-random" version = "0.1.18" @@ -948,19 +731,19 @@ dependencies = [ ] [[package]] -name = "constant_time_eq" -version = "0.4.2" +name = "const_panic" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" +dependencies = [ + "typewit", +] [[package]] -name = "convert_case" -version = "0.11.0" +name = "constant_time_eq" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "affbf0190ed2caf063e3def54ff444b449371d55c58e513a95ab98eca50adb49" -dependencies = [ - "unicode-segmentation", -] +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "core-foundation" @@ -978,6 +761,21 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core_extensions" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -1005,19 +803,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -1028,73 +813,22 @@ dependencies = [ ] [[package]] -name = "crossbeam-deque" -version = "0.8.6" +name = "crossbeam-utils" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "crunchy" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] -name = "crossbeam-queue" -version = "0.3.12" +name = "crypto-common" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - -[[package]] -name = "crossterm" -version = "0.29.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" -dependencies = [ - "bitflags", - "crossterm_winapi", - "document-features", - "parking_lot", - "rustix", - "winapi", -] - -[[package]] -name = "crossterm_winapi" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" -dependencies = [ - "winapi", -] - -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - -[[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", @@ -1141,7 +875,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn", + "syn 2.0.117", ] [[package]] @@ -1152,7 +886,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1209,7 +943,7 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "itertools 0.14.0", + "itertools", "liblzma", "log", "object_store", @@ -1243,7 +977,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "itertools 0.14.0", + "itertools", "log", "object_store", "parking_lot", @@ -1268,7 +1002,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "futures", - "itertools 0.14.0", + "itertools", "log", "object_store", ] @@ -1287,7 +1021,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", + "itertools", "libc", "log", "object_store", @@ -1334,7 +1068,7 @@ dependencies = [ 
"flate2", "futures", "glob", - "itertools 0.14.0", + "itertools", "liblzma", "log", "object_store", @@ -1364,7 +1098,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "futures", - "itertools 0.14.0", + "itertools", "object_store", "tokio", ] @@ -1458,7 +1192,7 @@ dependencies = [ "datafusion-pruning", "datafusion-session", "futures", - "itertools 0.14.0", + "itertools", "log", "object_store", "parking_lot", @@ -1511,7 +1245,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", - "itertools 0.14.0", + "itertools", "paste", "recursive", "serde_json", @@ -1527,10 +1261,40 @@ dependencies = [ "arrow", "datafusion-common", "indexmap", - "itertools 0.14.0", + "itertools", "paste", ] +[[package]] +name = "datafusion-ffi" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" +dependencies = [ + "abi_stable", + "arrow", + "arrow-schema", + "async-ffi", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto", + "datafusion-proto-common", + "datafusion-session", + "futures", + "log", + "prost", + "semver", + "tokio", +] + [[package]] name = "datafusion-functions" version = "53.1.0" @@ -1551,7 +1315,7 @@ dependencies = [ "datafusion-expr-common", "datafusion-macros", "hex", - "itertools 0.14.0", + "itertools", "log", "md-5", "memchr", @@ -1617,7 +1381,7 @@ dependencies = [ "datafusion-macros", "datafusion-physical-expr-common", "hashbrown 0.16.1", - "itertools 0.14.0", + "itertools", "itoa", "log", "paste", @@ -1674,6 +1438,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", + "datafusion-ffi", "datafusion-proto", "datafusion-substrait", "futures", 
@@ -1682,15 +1447,6 @@ dependencies = [ "prost", "prost-build", "protoc-bin-vendored", - "re_auth", - "re_dataframe", - "re_datafusion", - "re_log_types", - "re_protos", - "re_redap_client", - "re_types_core", - "re_uri", - "rustls", "tokio", "tokio-metrics", "url", @@ -1704,7 +1460,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1720,7 +1476,7 @@ dependencies = [ "datafusion-expr-common", "datafusion-physical-expr", "indexmap", - "itertools 0.14.0", + "itertools", "log", "recursive", "regex", @@ -1743,7 +1499,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", + "itertools", "parking_lot", "paste", "petgraph", @@ -1763,7 +1519,7 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-expr-common", - "itertools 0.14.0", + "itertools", ] [[package]] @@ -1779,7 +1535,7 @@ dependencies = [ "datafusion-expr-common", "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", + "itertools", "parking_lot", ] @@ -1798,7 +1554,7 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-pruning", - "itertools 0.14.0", + "itertools", "recursive", ] @@ -1826,7 +1582,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", + "itertools", "log", "num-traits", "parking_lot", @@ -1886,7 +1642,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools 0.14.0", + "itertools", "log", ] @@ -1934,7 +1690,7 @@ dependencies = [ "chrono", "datafusion", "half", - "itertools 0.14.0", + "itertools", "object_store", "pbjson-types", "prost", @@ -1954,27 +1710,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "directories" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16f5094c54661b38d03bd7e50df373292118db60b585c08a411c6d840017fe7d" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" -dependencies = [ - "libc", - "option-ext", - "redox_users", - "windows-sys 0.61.2", -] - [[package]] name = "displaydoc" version = "0.2.5" @@ -1983,65 +1718,21 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", -] - -[[package]] -name = "document-features" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" -dependencies = [ - "litrs", + "syn 2.0.117", ] -[[package]] -name = "dtoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" - -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "dyn-clone" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" -[[package]] -name = "ehttp" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f1b93eb2e039aaff63ce07cca59bd1dca02f2ce30075a17b619d2c42f56efc" -dependencies = [ - "async-channel", - "document-features", - "js-sys", - "serde", - "serde_json", - "ureq", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 
-[[package]] -name = "emath" -version = "0.34.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b53f0d33a479321da6b0caa71366c9f67e8a2c149762d90bdc0d16e601ee8ecb" - [[package]] name = "equivalent" version = "1.0.2" @@ -2058,27 +1749,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "event-listener" -version = "5.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" -dependencies = [ - "concurrent-queue", - "parking", - "pin-project-lite", -] - -[[package]] -name = "event-listener-strategy" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" -dependencies = [ - "event-listener", - "pin-project-lite", -] - [[package]] name = "fastrand" version = "2.4.1" @@ -2091,19 +1761,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" -[[package]] -name = "fixed" -version = "1.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9af2cbf772fa6d1c11358f92ef554cb6b386201210bcf0e91fb7fba8a907fb40" -dependencies = [ - "az", - "bytemuck", - "half", - "serde", - "typenum", -] - [[package]] name = "fixedbitset" version = "0.5.7" @@ -2158,12 +1815,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fs_extra" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" - [[package]] name = "futures" version = "0.3.32" @@ -2220,7 +1871,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2252,6 +1903,15 @@ dependencies = [ "slab", ] +[[package]] +name = 
"generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2334,7 +1994,6 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ - "bytemuck", "cfg-if", "crunchy", "num-traits", @@ -2385,15 +2044,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - [[package]] name = "http" version = "1.4.0" @@ -2433,12 +2083,6 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - [[package]] name = "humantime" version = "2.3.0" @@ -2459,7 +2103,6 @@ dependencies = [ "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -2481,20 +2124,6 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots", -] - -[[package]] -name = "hyper-timeout" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" -dependencies = [ - "hyper", - "hyper-util", - "pin-project-lite", - "tokio", - "tower-service", ] [[package]] @@ -2659,12 +2288,6 @@ dependencies = [ 
"icu_properties", ] -[[package]] -name = "indent" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1a0777d972970f204fdf8ef319f1f4f8459131636d7e3c96c5d59570d0fa6" - [[package]] name = "indexmap" version = "2.14.0" @@ -2677,15 +2300,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "infer" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847" -dependencies = [ - "cfb", -] - [[package]] name = "integer-encoding" version = "3.0.4" @@ -2698,21 +2312,6 @@ version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" -[[package]] -name = "is_terminal_polyfill" -version = "1.70.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.14.0" @@ -2728,49 +2327,6 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" -[[package]] -name = "jiff" -version = "0.2.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4603d3033e49e2b0e31229fcab20a5d40089c607d975cd9c80551dc69eed9102" -dependencies = [ - "jiff-static", - "jiff-tzdb-platform", - "js-sys", - "log", - "portable-atomic", - "portable-atomic-util", - "serde_core", - "wasm-bindgen", - "windows-link", -] - -[[package]] -name = "jiff-static" -version = "0.2.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "jiff-tzdb" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" - -[[package]] -name = "jiff-tzdb-platform" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" -dependencies = [ - "jiff-tzdb", -] - [[package]] name = "jni" version = "0.21.1" @@ -2812,7 +2368,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2837,27 +2393,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "jsonwebtoken" -version = "10.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eba32bfb4ffdeaca3e34431072faf01745c9b26d25504aa7a6cf5684334fc4fc" -dependencies = [ - "base64", - "getrandom 0.2.17", - "js-sys", - "serde", - "serde_json", - "signature", - "zeroize", -] - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "leb128fmt" version = "0.1.0" @@ -2933,6 +2468,16 @@ version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "liblzma" version = "0.4.6" @@ -2959,15 +2504,6 @@ version = "0.2.16" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" -[[package]] -name = "libredox" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" -dependencies = [ - "libc", -] - [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -2980,12 +2516,6 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" -[[package]] -name = "litrs" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" - [[package]] name = "lock_api" version = "0.4.14" @@ -2993,7 +2523,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ "scopeguard", - "serde", ] [[package]] @@ -3002,15 +2531,6 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" -[[package]] -name = "log-once" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d8a05e3879b317b1b6dbf353e5bba7062bedcc59815267bb23eaa0c576cebf0" -dependencies = [ - "log", -] - [[package]] name = "lru-slab" version = "0.1.2" @@ -3026,31 +2546,6 @@ dependencies = [ "twox-hash", ] -[[package]] -name = "matchers" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" -dependencies = [ - "regex-automata", -] - -[[package]] -name = "matchit" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" - -[[package]] -name = "matrixmultiply" -version = "0.3.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" -dependencies = [ - "autocfg", - "rawpointer", -] - [[package]] name = "md-5" version = "0.10.6" @@ -3067,34 +2562,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" -[[package]] -name = "memory-stats" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c73f5c649995a115e1a0220b35e4df0a1294500477f97a91d0660fb5abeb574a" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - -[[package]] -name = "mime_guess2" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1706dc14a2e140dec0a7a07109d9a3d5890b81e85bd6c60b906b249a77adf0ca" -dependencies = [ - "mime", - "phf 0.11.3", - "phf_shared 0.11.3", - "unicase", -] - [[package]] name = "miniz_oxide" version = "0.8.9" @@ -3122,42 +2589,6 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" -[[package]] -name = "natord" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308d96db8debc727c3fd9744aac51751243420e46edf401010908da7f8d5e57c" - -[[package]] -name = "ndarray" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" -dependencies = [ - "matrixmultiply", - "num-complex", - "num-integer", - 
"num-traits", - "portable-atomic", - "portable-atomic-util", - "rawpointer", -] - -[[package]] -name = "nohash-hasher" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" - -[[package]] -name = "nu-ansi-term" -version = "0.50.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -3178,17 +2609,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-derive" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "num-integer" version = "0.1.46" @@ -3235,7 +2655,7 @@ dependencies = [ "http-body-util", "humantime", "hyper", - "itertools 0.14.0", + "itertools", "md-5", "parking_lot", "percent-encoding", @@ -3262,113 +2682,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" -[[package]] -name = "once_cell_polyfill" -version = "1.70.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" - [[package]] name = "openssl-probe" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" -[[package]] -name = "opentelemetry" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" -dependencies = [ - "futures-core", - "futures-sink", - "js-sys", - "pin-project-lite", - 
"thiserror 2.0.18", - "tracing", -] - -[[package]] -name = "opentelemetry-appender-tracing" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2" -dependencies = [ - "opentelemetry", - "tracing", - "tracing-core", - "tracing-opentelemetry", - "tracing-subscriber", -] - -[[package]] -name = "opentelemetry-http" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" -dependencies = [ - "async-trait", - "bytes", - "http", - "opentelemetry", - "reqwest", -] - -[[package]] -name = "opentelemetry-otlp" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f" -dependencies = [ - "http", - "opentelemetry", - "opentelemetry-http", - "opentelemetry-proto", - "opentelemetry_sdk", - "prost", - "reqwest", - "thiserror 2.0.18", - "tokio", - "tonic", - "tracing", -] - -[[package]] -name = "opentelemetry-proto" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" -dependencies = [ - "opentelemetry", - "opentelemetry_sdk", - "prost", - "tonic", - "tonic-prost", -] - -[[package]] -name = "opentelemetry_sdk" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" -dependencies = [ - "futures-channel", - "futures-executor", - "futures-util", - "opentelemetry", - "percent-encoding", - "rand 0.9.4", - "thiserror 2.0.18", - "tokio", - "tokio-stream", -] - -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - [[package]] name = "ordered-float" version = "2.10.1" @@ -3378,12 +2697,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "parking" -version = "2.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" - [[package]] name = "parking_lot" version = "0.12.5" @@ -3466,7 +2779,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck", - "itertools 0.14.0", + "itertools", "prost", "prost-types", ] @@ -3486,33 +2799,6 @@ dependencies = [ "serde", ] -[[package]] -name = "peg" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aad070be5b63aa72103f2fcdd70a83adbd5e90112ce5b574171ff1c65501773" -dependencies = [ - "peg-macros", - "peg-runtime", -] - -[[package]] -name = "peg-macros" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd8ef6825cae95355031ae26a99b616a2a21f22ba2de0197c43dfb05acbe7ee" -dependencies = [ - "peg-runtime", - "proc-macro2", - "quote", -] - -[[package]] -name = "peg-runtime" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7011d97b484a5ebdc4b1fdb3b12d5e4bbbea56e9d22b688f2e79e04b65a7d8a6" - [[package]] name = "percent-encoding" version = "2.3.2" @@ -3531,57 +2817,13 @@ dependencies = [ "serde", ] -[[package]] -name = "phf" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_macros", - "phf_shared 0.11.3", -] - [[package]] name = "phf" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" 
dependencies = [ - "phf_shared 0.12.1", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" -dependencies = [ - "phf_shared 0.11.3", - "rand 0.8.6", -] - -[[package]] -name = "phf_macros" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" -dependencies = [ - "phf_generator", - "phf_shared 0.11.3", - "proc-macro2", - "quote", - "syn", - "unicase", -] - -[[package]] -name = "phf_shared" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" -dependencies = [ - "siphasher", - "unicase", + "phf_shared", ] [[package]] @@ -3593,26 +2835,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "pin-project" -version = "1.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.17" @@ -3625,32 +2847,6 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" -[[package]] -name = "ply-rs-bw" -version = "3.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe55bbee2b70d1c1e58d8340eda9a80c5ce11fb9b1bc10b5fc1575c490d38fa9" -dependencies = [ - "byteorder", - "indexmap", - "peg", -] - -[[package]] -name = 
"portable-atomic" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" - -[[package]] -name = "portable-atomic-util" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" -dependencies = [ - "portable-atomic", -] - [[package]] name = "potential_utf" version = "0.1.5" @@ -3676,16 +2872,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", -] - -[[package]] -name = "proc-macro-crate" -version = "3.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" -dependencies = [ - "toml_edit", + "syn 2.0.117", ] [[package]] @@ -3697,29 +2884,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "prometheus-client" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca3d75b4566b9a29fe1ed623587fb058e826eb329a0be4b7c4da1ebb2d7a6ca" -dependencies = [ - "dtoa", - "itoa", - "parking_lot", - "prometheus-client-derive-encode", -] - -[[package]] -name = "prometheus-client-derive-encode" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9adf1691c04c0a5ff46ff8f262b58beb07b0dbb61f96f9f54f6cbd82106ed87f" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "prost" version = "0.14.3" @@ -3737,7 +2901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.14.0", + "itertools", "log", "multimap", "petgraph", @@ -3745,7 +2909,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn", + 
"syn 2.0.117", "tempfile", ] @@ -3756,10 +2920,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -3854,20 +3018,6 @@ dependencies = [ "cc", ] -[[package]] -name = "puffin" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa9dae7b05c02ec1a6bc9bcf20d8bc64a7dcbf57934107902a872014899b741f" -dependencies = [ - "anyhow", - "byteorder", - "cfg-if", - "itertools 0.10.5", - "once_cell", - "parking_lot", -] - [[package]] name = "quad-rand" version = "0.2.3" @@ -3960,15 +3110,6 @@ version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" -[[package]] -name = "rand" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" -dependencies = [ - "rand_core 0.6.4", -] - [[package]] name = "rand" version = "0.9.4" @@ -4000,15 +3141,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.17", -] - [[package]] name = "rand_core" version = "0.9.5" @@ -4025,720 +3157,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" [[package]] -name = "rawpointer" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" - -[[package]] -name = "rayon" -version = "1.12.0" +name = "recursive" +version = 
"0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" dependencies = [ - "either", - "rayon-core", + "recursive-proc-macro-impl", + "stacker", ] [[package]] -name = "rayon-core" -version = "1.13.0" +name = "recursive-proc-macro-impl" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ - "crossbeam-deque", - "crossbeam-utils", + "quote", + "syn 2.0.117", ] [[package]] -name = "re_analytics" -version = "0.34.0-alpha.1+dev" +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "crossbeam", - "directories", - "ehttp", - "jiff", - "re_build_info", - "re_log", - "re_quota_channel", - "serde", - "serde_json", - "sha2", - "thiserror 2.0.18", - "url", - "uuid", - "web-sys", + "bitflags", ] [[package]] -name = "re_arrow_util" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "anyhow", - "arrow", - "comfy-table", - "half", - "itertools 0.14.0", - "re_log", - "re_tracing", - "re_tuid", - "serde_json", - "thiserror 2.0.18", -] - -[[package]] -name = "re_auth" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "async-trait", - "base64", - "directories", - "ehttp", - "getrandom 0.2.17", - "getrandom 0.3.4", - "hmac", - "http", - "jiff", - "js-sys", - "jsonwebtoken", - "parking_lot", - "rand 0.9.4", - "re_analytics", - "re_log", - "ring", - "saturating_cast", - "serde", - "serde_json", - "sha2", - "signature", - "thiserror 2.0.18", - "tiny_http", - "tokio", - "tonic", - "tower", - "url", - "uuid", - "wasm-bindgen", - "web-sys", -] - -[[package]] 
-name = "re_backoff" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "getrandom 0.3.4", - "js-sys", - "rand 0.9.4", - "tokio", - "wasm-bindgen-futures", - "web-sys", -] - -[[package]] -name = "re_build_info" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "re_byte_size", - "serde", -] - -[[package]] -name = "re_byte_size" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "arrow", - "half", - "parking_lot", - "re_byte_size_derive", - "smallvec", - "vec1", -] - -[[package]] -name = "re_byte_size_derive" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "re_case" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "convert_case", -] - -[[package]] -name = "re_chunk" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "anyhow", - "arrow", - "bytemuck", - "crossbeam", - "document-features", - "half", - "itertools 0.14.0", - "nohash-hasher", - "rand 0.9.4", - "re_arrow_util", - "re_byte_size", - "re_error", - "re_format", - "re_log", - "re_log_types", - "re_quota_channel", - "re_sorbet", - "re_span", - "re_tracing", - "re_types_core", - "thiserror 2.0.18", -] - -[[package]] -name = "re_chunk_store" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "anyhow", - "arrow", - "document-features", - "indent", - "itertools 0.14.0", - "nohash-hasher", - "parking_lot", - "re_arrow_util", - "re_byte_size", - "re_chunk", - "re_format", - "re_log", - "re_log_encoding", - "re_log_types", - "re_sdk_types", - "re_sorbet", - "re_tracing", - "re_types_core", - "saturating_cast", - "thiserror 2.0.18", - "web-time", -] - -[[package]] -name = "re_dataframe" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "anyhow", - "arrow", - "itertools 0.14.0", - "nohash-hasher", - "rayon", - "re_arrow_util", - "re_chunk", - "re_chunk_store", - "re_log", - "re_log_types", - "re_query", - "re_sorbet", - "re_span", - "re_tracing", - "re_types_core", - "tracing", -] - -[[package]] -name = 
"re_datafusion" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "arrow", - "async-stream", - "async-trait", - "chrono", - "datafusion", - "futures", - "futures-util", - "getrandom 0.3.4", - "http", - "itertools 0.14.0", - "jiff", - "opentelemetry", - "opentelemetry-proto", - "parking_lot", - "re_analytics", - "re_arrow_util", - "re_backoff", - "re_byte_size", - "re_dataframe", - "re_format", - "re_log", - "re_log_encoding", - "re_log_types", - "re_perf_telemetry", - "re_protos", - "re_redap_client", - "re_sorbet", - "re_tracing", - "re_types_core", - "re_uri", - "reqwest", - "tokio", - "tokio-stream", - "tonic", - "tonic-prost", - "tracing", - "wasm-bindgen-futures", - "web-time", -] - -[[package]] -name = "re_error" -version = "0.34.0-alpha.1+dev" - -[[package]] -name = "re_format" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "half", - "itertools 0.14.0", - "num-traits", - "re_log", -] - -[[package]] -name = "re_grpc_headers" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "http", - "pin-project-lite", - "tonic", - "tower", -] - -[[package]] -name = "re_log" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "crossbeam", - "log", - "log-once", - "parking_lot", - "tracing", - "tracing-log", - "tracing-subscriber", - "tracing-web", -] - -[[package]] -name = "re_log_channel" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "camino", - "crossbeam", - "futures", - "parking_lot", - "re_byte_size", - "re_log_encoding", - "re_log_types", - "re_quota_channel", - "re_tracing", - "re_uri", - "serde", - "thiserror 2.0.18", -] - -[[package]] -name = "re_log_encoding" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "arrow", - "bytes", - "crossbeam", - "itertools 0.14.0", - "lz4_flex", - "parking_lot", - "re_arrow_util", - "re_build_info", - "re_byte_size", - "re_chunk", - "re_log", - "re_log_types", - "re_protos", - "re_quota_channel", - "re_sorbet", - "re_span", - "re_tracing", - "re_types_core", - "sha2", - "thiserror 2.0.18", - "tokio", - 
"tokio-stream", - "tracing", - "xxhash-rust", -] - -[[package]] -name = "re_log_types" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "arrow", - "bytemuck", - "clean-path", - "document-features", - "fixed", - "half", - "itertools 0.14.0", - "jiff", - "natord", - "nohash-hasher", - "num-derive", - "num-traits", - "parking_lot", - "re_arrow_util", - "re_build_info", - "re_byte_size", - "re_format", - "re_log", - "re_string_interner", - "re_tracing", - "re_tuid", - "re_types_core", - "serde", - "static_assertions", - "thiserror 2.0.18", - "typenum", - "uuid", - "web-time", - "xxhash-rust", -] - -[[package]] -name = "re_perf_telemetry" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "anyhow", - "axum", - "base64", - "clap", - "http", - "memory-stats", - "opentelemetry", - "opentelemetry-appender-tracing", - "opentelemetry-http", - "opentelemetry-otlp", - "opentelemetry_sdk", - "parking_lot", - "prometheus-client", - "rand 0.9.4", - "re_auth", - "re_grpc_headers", - "serde", - "serde_json", - "tokio", - "tonic", - "tower", - "tower-http", - "tracing", - "tracing-opentelemetry", - "tracing-subscriber", -] - -[[package]] -name = "re_protos" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "arrow", - "http", - "itertools 0.14.0", - "jiff", - "lz4_flex", - "opentelemetry", - "prost", - "prost-types", - "re_arrow_util", - "re_build_info", - "re_byte_size", - "re_chunk", - "re_grpc_headers", - "re_log_types", - "re_sorbet", - "re_tracing", - "re_tuid", - "re_types_core", - "serde", - "thiserror 2.0.18", - "tonic", - "tonic-prost", - "tower", - "tracing", - "url", -] - -[[package]] -name = "re_query" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "anyhow", - "arrow", - "indent", - "itertools 0.14.0", - "nohash-hasher", - "parking_lot", - "paste", - "re_byte_size", - "re_chunk", - "re_chunk_store", - "re_error", - "re_format", - "re_log", - "re_log_types", - "re_tracing", - "re_types_core", - "seq-macro", - "thiserror 2.0.18", 
-] - -[[package]] -name = "re_quota_channel" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "crossbeam", - "parking_lot", - "re_byte_size", - "re_format", - "re_log", -] - -[[package]] -name = "re_redap_client" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "arrow", - "ehttp", - "futures", - "itertools 0.14.0", - "jiff", - "opentelemetry", - "re_arrow_util", - "re_auth", - "re_backoff", - "re_byte_size", - "re_chunk", - "re_error", - "re_format", - "re_log", - "re_log_channel", - "re_log_encoding", - "re_log_types", - "re_protos", - "re_tracing", - "re_types_core", - "re_uri", - "serde", - "thiserror 2.0.18", - "tokio", - "tokio-stream", - "tonic", - "tonic-web-wasm-client", - "tower", - "tracing", - "url", - "web-time", -] - -[[package]] -name = "re_rvl" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "byteorder", - "thiserror 2.0.18", -] - -[[package]] -name = "re_sdk_types" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "array-init", - "arrow", - "bytemuck", - "document-features", - "emath", - "half", - "indexmap", - "infer", - "itertools 0.14.0", - "mime_guess2", - "ndarray", - "nohash-hasher", - "ply-rs-bw", - "re_byte_size", - "re_error", - "re_format", - "re_log", - "re_log_types", - "re_rvl", - "re_sorbet", - "re_tracing", - "re_types_core", - "serde", - "smallvec", - "thiserror 2.0.18", - "uuid", -] - -[[package]] -name = "re_sorbet" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "arrow", - "itertools 0.14.0", - "nohash-hasher", - "re_arrow_util", - "re_byte_size", - "re_log", - "re_log_types", - "re_tracing", - "re_tuid", - "re_types_core", - "semver", - "strum 0.26.3", - "thiserror 2.0.18", - "tracing", - "web-time", -] - -[[package]] -name = "re_span" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "num-traits", -] - -[[package]] -name = "re_string_interner" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "ahash", - "nohash-hasher", - "parking_lot", - "re_byte_size", - "serde", - "static_assertions", -] - 
-[[package]] -name = "re_tracing" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "parking_lot", - "puffin", -] - -[[package]] -name = "re_tuid" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "bytemuck", - "document-features", - "getrandom 0.3.4", - "re_byte_size", - "re_log", - "serde", - "web-time", -] - -[[package]] -name = "re_types_core" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "anyhow", - "arrow", - "bitflags", - "bytemuck", - "document-features", - "half", - "itertools 0.14.0", - "nohash-hasher", - "re_arrow_util", - "re_byte_size", - "re_case", - "re_error", - "re_log", - "re_string_interner", - "re_tracing", - "re_tuid", - "serde", - "thiserror 2.0.18", -] - -[[package]] -name = "re_uri" -version = "0.34.0-alpha.1+dev" -dependencies = [ - "percent-encoding", - "re_byte_size", - "re_log", - "re_log_types", - "re_tuid", - "re_types_core", - "serde", - "static_assertions", - "thiserror 2.0.18", - "url", -] - -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn", -] - -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - -[[package]] -name = "redox_users" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" -dependencies = [ - "getrandom 0.2.17", - "libredox", - "thiserror 2.0.18", -] - -[[package]] 
-name = "regex" -version = "1.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -4779,6 +3230,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + [[package]] name = "reqwest" version = "0.12.28" @@ -4787,7 +3247,6 @@ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64", "bytes", - "futures-channel", "futures-core", "futures-util", "h2", @@ -4820,7 +3279,6 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots", ] [[package]] @@ -4871,8 +3329,6 @@ version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ - "aws-lc-rs", - "log", "once_cell", "ring", "rustls-pki-types", @@ -4909,7 +3365,6 @@ version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ - "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -4936,12 +3391,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "saturating_cast" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fc4972f129a0ea378b69fa7c186d63255606e362ad00795f00b869dea5265eb" - [[package]] name = "schannel" version = "0.1.29" @@ -4972,7 +3421,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn", + "syn 
2.0.117", ] [[package]] @@ -5057,7 +3506,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5068,7 +3517,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5084,17 +3533,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_path_to_error" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" -dependencies = [ - "itoa", - "serde", - "serde_core", -] - [[package]] name = "serde_tokenstream" version = "0.2.3" @@ -5104,7 +3542,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn", + "syn 2.0.117", ] [[package]] @@ -5143,30 +3581,12 @@ dependencies = [ "digest", ] -[[package]] -name = "sharded-slab" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" -dependencies = [ - "lazy_static", -] - [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" -[[package]] -name = "signature" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" -dependencies = [ - "rand_core 0.6.4", -] - [[package]] name = "simd-adler32" version = "0.3.9" @@ -5232,7 +3652,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5254,46 +3674,18 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" -dependencies = [ - "strum_macros 0.26.4", -] - [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "strum_macros" version = "0.27.2" @@ -5303,7 +3695,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5327,7 +3719,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn", + "syn 2.0.117", "typify", "walkdir", ] @@ -5338,6 +3730,17 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.117" @@ -5366,7 +3769,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] 
[[package]] @@ -5408,7 +3811,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5419,16 +3822,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", -] - -[[package]] -name = "thread_local" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" -dependencies = [ - "cfg-if", + "syn 2.0.117", ] [[package]] @@ -5451,18 +3845,6 @@ dependencies = [ "crunchy", ] -[[package]] -name = "tiny_http" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82" -dependencies = [ - "ascii", - "chunked_transfer", - "httpdate", - "log", -] - [[package]] name = "tinystr" version = "0.8.3" @@ -5511,7 +3893,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5561,103 +3943,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml_datetime" -version = "1.1.1+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" -dependencies = [ - "serde_core", -] - -[[package]] -name = "toml_edit" -version = "0.25.12+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7" -dependencies = [ - "indexmap", - "toml_datetime", - "toml_parser", - "winnow", -] - -[[package]] -name = "toml_parser" -version = "1.1.2+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" -dependencies = [ - 
"winnow", -] - -[[package]] -name = "tonic" -version = "0.14.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" -dependencies = [ - "async-trait", - "base64", - "bytes", - "flate2", - "h2", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-timeout", - "hyper-util", - "percent-encoding", - "pin-project", - "rustls-native-certs", - "socket2", - "sync_wrapper", - "tokio", - "tokio-rustls", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tonic-prost" -version = "0.14.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" -dependencies = [ - "bytes", - "prost", - "tonic", -] - -[[package]] -name = "tonic-web-wasm-client" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898cd44be5e23e59d2956056538f1d6b3c5336629d384ffd2d92e76f87fb98ff" -dependencies = [ - "base64", - "byteorder", - "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "httparse", - "js-sys", - "pin-project", - "thiserror 2.0.18", - "tonic", - "tower-service", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams", - "web-sys", -] - [[package]] name = "tower" version = "0.5.3" @@ -5666,15 +3951,11 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap", "pin-project-lite", - "slab", "sync_wrapper", "tokio", - "tokio-util", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -5692,7 +3973,6 @@ dependencies = [ "tower", "tower-layer", "tower-service", - "tracing", "url", ] @@ -5714,7 +3994,6 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ - "log", 
"pin-project-lite", "tracing-attributes", "tracing-core", @@ -5728,7 +4007,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -5738,85 +4017,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", - "valuable", -] - -[[package]] -name = "tracing-log" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" -dependencies = [ - "log", - "once_cell", - "tracing-core", -] - -[[package]] -name = "tracing-opentelemetry" -version = "0.32.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" -dependencies = [ - "js-sys", - "opentelemetry", - "smallvec", - "tracing", - "tracing-core", - "tracing-log", - "tracing-subscriber", - "web-time", -] - -[[package]] -name = "tracing-serde" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" -dependencies = [ - "serde", - "tracing-core", ] [[package]] -name = "tracing-subscriber" -version = "0.3.23" +name = "try-lock" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" -dependencies = [ - "matchers", - "nu-ansi-term", - "once_cell", - "regex-automata", - "serde", - "serde_json", - "sharded-slab", - "smallvec", - "thread_local", - "tracing", - "tracing-core", - "tracing-log", - "tracing-serde", -] +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] -name = "tracing-web" -version = "0.1.3" +name = "tstr" +version = "0.2.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e6a141feebd51f8d91ebfd785af50fca223c570b86852166caa3b141defe7c" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" dependencies = [ - "js-sys", - "tracing-core", - "tracing-subscriber", - "wasm-bindgen", - "web-sys", + "tstr_proc_macros", ] [[package]] -name = "try-lock" -version = "0.2.5" +name = "tstr_proc_macros" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" [[package]] name = "twox-hash" @@ -5824,12 +4046,24 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + [[package]] name = "typenum" version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +[[package]] +name = "typewit" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" + [[package]] name = "typify" version = "0.5.0" @@ -5855,7 +4089,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn", + "syn 2.0.117", "thiserror 2.0.18", "unicode-ident", ] @@ -5873,16 +4107,10 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn", + "syn 2.0.117", "typify-impl", ] -[[package]] -name = "unicase" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" - [[package]] name = 
"unicode-ident" version = "1.0.24" @@ -5919,35 +4147,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "3.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" -dependencies = [ - "base64", - "flate2", - "log", - "percent-encoding", - "rustls", - "rustls-pki-types", - "ureq-proto", - "utf8-zero", - "webpki-roots", -] - -[[package]] -name = "ureq-proto" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" -dependencies = [ - "base64", - "http", - "httparse", - "log", -] - [[package]] name = "url" version = "2.5.8" @@ -5958,27 +4157,14 @@ dependencies = [ "idna", "percent-encoding", "serde", - "serde_derive", ] -[[package]] -name = "utf8-zero" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" - [[package]] name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" -[[package]] -name = "utf8parse" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" - [[package]] name = "uuid" version = "1.23.1" @@ -5991,22 +4177,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "valuable" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" - -[[package]] -name = "vec1" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"eab68b56840f69efb0fefbe3ab6661499217ffdc58e2eef7c3f6f69835386322" -dependencies = [ - "serde", - "smallvec", -] - [[package]] name = "version_check" version = "0.9.5" @@ -6098,7 +4268,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -6178,15 +4348,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "winapi" version = "0.3.9" @@ -6239,7 +4400,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -6250,7 +4411,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -6499,15 +4660,6 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" -[[package]] -name = "winnow" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen" version = "0.51.0" @@ -6544,7 +4696,7 @@ dependencies = [ "heck", "indexmap", "prettyplease", - "syn", + "syn 2.0.117", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -6560,7 +4712,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -6608,12 +4760,6 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" -[[package]] -name = "xxhash-rust" -version 
= "0.8.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" - [[package]] name = "yoke" version = "0.8.2" @@ -6633,7 +4779,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -6654,7 +4800,7 @@ checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -6674,7 +4820,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -6683,20 +4829,6 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" -dependencies = [ - "zeroize_derive", -] - -[[package]] -name = "zeroize_derive" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] [[package]] name = "zerotrie" @@ -6728,7 +4860,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] diff --git a/native/Cargo.toml b/native/Cargo.toml index 9ca2a39..b7e96af 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -72,6 +72,7 @@ runtime-metrics = ["dep:tokio-metrics"] arrow = { version = "58", features = ["ffi"] } async-trait = "0.1" datafusion = { version = "53.1.0", features = ["avro"] } +datafusion-ffi = "53.1.0" datafusion-proto = "53.1.0" datafusion-substrait = { version = "53.1.0", optional = true } futures = "0.3" @@ -82,20 +83,6 @@ object_store = { version = "0.13", default-features = false } prost 
= "0.14" tokio = { version = "1", features = ["rt-multi-thread"] } -# Rerun provider crates. Linked directly into the cdylib so the embedded -# DataFusion `SessionContext` can register the `TableProvider` via the -# native `ctx.register_table(...)` path — no `datafusion-ffi` boundary. -re_datafusion = { path = "../../reality/rerun/crates/store/re_datafusion" } -re_redap_client = { path = "../../reality/rerun/crates/store/re_redap_client" } -re_dataframe = { path = "../../reality/rerun/crates/store/re_dataframe" } -re_log_types = { path = "../../reality/rerun/crates/store/re_log_types" } -re_protos = { path = "../../reality/rerun/crates/store/re_protos" } -re_types_core = { path = "../../reality/rerun/crates/store/re_types_core" } -re_uri = { path = "../../reality/rerun/crates/store/re_uri" } -re_auth = { path = "../../reality/rerun/crates/utils/re_auth" } -# Direct rustls dep so we can install the default crypto provider once at -# startup — required by the rerun TLS stack (see rerun_py catalog_client.rs). -rustls = "0.23" # Tokio runtime metrics. Optional + cfg-gated: this crate's API surface lives # behind `--cfg tokio_unstable`, so enabling the `runtime-metrics` feature also # requires the caller to set `RUSTFLAGS="--cfg tokio_unstable"` at build time. diff --git a/native/build.rs b/native/build.rs index dd3918e..d292514 100644 --- a/native/build.rs +++ b/native/build.rs @@ -28,7 +28,6 @@ fn main() { "../proto/json_write_options.proto", "../proto/object_store_options.proto", "../proto/parquet_read_options.proto", - "../proto/rerun_table_options.proto", ]; for p in PROTOS { println!("cargo:rerun-if-changed={p}"); diff --git a/native/src/ffi_table_provider.rs b/native/src/ffi_table_provider.rs new file mode 100644 index 0000000..f055263 --- /dev/null +++ b/native/src/ffi_table_provider.rs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Generic FFI-bridged TableProvider registration. +//! +//! Accepts a raw `FFI_TableProvider` pointer produced elsewhere — by another +//! cdylib (cross-binary boundary; transparently wrapped via +//! `ForeignTableProvider`) or by Rust code in this same crate (same-binary; +//! library marker lets the impl unwrap to the original Arc). +//! +//! Ownership: the caller's `Box::into_raw(Box::new(FFI_TableProvider))` +//! pointer is consumed here. After this call the pointer must not be reused. 
+ +use std::sync::Arc; + +use datafusion::catalog::TableProvider; +use datafusion::prelude::SessionContext; +use datafusion_ffi::table_provider::FFI_TableProvider; +use jni::objects::{JClass, JString}; +use jni::sys::jlong; +use jni::JNIEnv; + +use crate::errors::{try_unwrap_or_throw, JniResult}; + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_SessionContext_registerFfiTableNative<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + name: JString<'local>, + ffi_ptr: jlong, +) { + try_unwrap_or_throw(&mut env, (), |env| -> JniResult<()> { + if handle == 0 { + return Err("SessionContext handle is null".into()); + } + if ffi_ptr == 0 { + return Err("registerFfiTable: FFI_TableProvider pointer is null".into()); + } + // SAFETY: matches the existing `registerTableNative` pattern — handle + // came from `createSessionContext` as `Box` raw ptr. + let ctx = unsafe { &*(handle as *const SessionContext) }; + let name: String = env.get_string(&name)?.into(); + + // Take ownership of the producer's FFI_TableProvider, materialise an + // Arc on this side (cross-cdylib hop returns a + // ForeignTableProvider wrapper; same-cdylib hop returns the original + // Arc thanks to LIBRARY_MARKER dispatch in datafusion-ffi), then drop + // the Box — the Arc clone now retains ownership. 
+ let ffi = unsafe { Box::from_raw(ffi_ptr as *mut FFI_TableProvider) }; + let provider: Arc = (&*ffi).into(); + drop(ffi); + + ctx.register_table(name.as_str(), provider)?; + Ok(()) + }) +} diff --git a/native/src/lib.rs b/native/src/lib.rs index d2745aa..1777f19 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -20,12 +20,12 @@ mod avro; mod cache_manager; mod csv; mod errors; +mod ffi_table_provider; mod jni_util; mod json; mod memory; mod object_store; mod proto; -mod rerun_provider; mod runtime_metrics; mod schema; mod table_provider; diff --git a/native/src/rerun_provider.rs b/native/src/rerun_provider.rs deleted file mode 100644 index bce3454..0000000 --- a/native/src/rerun_provider.rs +++ /dev/null @@ -1,318 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Rerun `TableProvider` registration and segment enumeration JNI surface. -//! -//! Two JNI entry points used by the Spark connector. Scan + filter + projection -//! reuse the existing `createDataFrameFromProto` path (the JVM side encodes a -//! `LogicalPlanNode` referencing the registered table) so no new scan JNI is -//! introduced here. -//! -//! - `registerRerunTableNative`: decode a [`RerunTableOptions`] envelope, -//! 
construct a [`DataframeQueryTableProvider`] (does schema discovery + sets -//! up the gRPC connection), and register it on the embedded -//! [`SessionContext`] under the given name. -//! - `listRerunSegmentsNative`: enumerate segment ids for the dataset, used by -//! the Spark driver to plan one input partition per segment. - -use std::sync::{Arc, Once}; - -use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, TimeUnit}; -use datafusion::catalog::TableProvider; -use datafusion::prelude::SessionContext; -use jni::objects::{JByteArray, JClass, JString}; -use jni::sys::{jlong, jobjectArray}; -use jni::JNIEnv; -use prost::Message; - -use re_datafusion::DataframeQueryTableProvider; -use re_dataframe::QueryExpression; -use re_log_types::EntryId; -use re_protos::cloud::v1alpha1::EntryFilter; -use re_redap_client::{ConnectionClient, ConnectionRegistry, ConnectionRegistryHandle, Credentials}; -use re_types_core::TimelineName; - -use crate::errors::{try_unwrap_or_throw, JniResult}; -use crate::proto_gen::RerunTableOptions; -use crate::runtime; - -/// Idempotent install of rustls's `ring` crypto provider. The rerun TLS stack -/// crashes at runtime if no default provider is installed; this used to be -/// done implicitly by `object_store` but rerun no longer pulls that in, so the -/// JNI bridge installs it explicitly on first use. -fn init_rustls_crypto() { - static ONCE: Once = Once::new(); - ONCE.call_once(|| { - // `install_default` returns Err when a provider has already been - // installed by another path; we don't care which one wins. 
- let _ = rustls::crypto::ring::default_provider().install_default(); - }); -} - -fn build_registry_handle( - origin: &re_uri::Origin, - token: &str, -) -> JniResult { - let handle = ConnectionRegistry::new_with_stored_credentials(); - let credentials = if token.is_empty() { - Credentials::Stored - } else { - let jwt = re_auth::Jwt::try_from(token.to_owned())?; - Credentials::Token(jwt) - }; - handle.set_credentials(origin, credentials); - Ok(handle) -} - -async fn resolve_entry_id( - handle: &ConnectionRegistryHandle, - origin: &re_uri::Origin, - options: &RerunTableOptions, -) -> JniResult { - if !options.dataset_id.is_empty() { - let id: EntryId = options - .dataset_id - .parse() - .map_err(|e: std::num::ParseIntError| -> Box { - format!("invalid Rerun dataset_id {:?}: {}", options.dataset_id, e).into() - })?; - return Ok(id); - } - if options.dataset_name.is_empty() { - return Err("RerunTableOptions: one of `dataset_id` or `dataset_name` must be set".into()); - } - let mut client = handle.client(origin.clone()).await?; - let entries = client - .find_entries(EntryFilter::new().with_name(options.dataset_name.clone())) - .await?; - let entry = entries.into_iter().next().ok_or_else( - || -> Box { - format!("no Rerun entry found with name {:?}", options.dataset_name).into() - }, - )?; - Ok(entry.id) -} - -fn build_query_expression(options: &RerunTableOptions) -> QueryExpression { - let mut qe = QueryExpression::default(); - if !options.index.is_empty() { - qe.filtered_index = Some(TimelineName::new(options.index.as_str())); - } - qe -} - -async fn build_provider( - options: RerunTableOptions, -) -> JniResult> { - init_rustls_crypto(); - let origin: re_uri::Origin = options - .url - .parse() - .map_err(|e: re_uri::Error| -> Box { - format!("invalid Rerun url {:?}: {}", options.url, e).into() - })?; - let handle = build_registry_handle(&origin, options.token.as_str())?; - let entry_id = resolve_entry_id(&handle, &origin, &options).await?; - let query_expr = 
build_query_expression(&options); - - let provider = DataframeQueryTableProvider::::new( - origin, - handle, - entry_id, - &query_expr, - options.segments.as_slice(), - None, - None, - None, - Vec::new(), - ) - .await?; - Ok(Arc::new(provider)) -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_SessionContext_registerRerunTableNative<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - name: JString<'local>, - options_proto: JByteArray<'local>, -) { - try_unwrap_or_throw(&mut env, (), |env| -> JniResult<()> { - if handle == 0 { - return Err("SessionContext handle is null".into()); - } - // SAFETY: matches the existing `registerTableNative` pattern — handle - // came from `createSessionContext` as `Box` raw ptr. - let ctx = unsafe { &*(handle as *const SessionContext) }; - let name: String = env.get_string(&name)?.into(); - let bytes: Vec = env.convert_byte_array(&options_proto)?; - let options = RerunTableOptions::decode(bytes.as_slice())?; - - let provider = runtime().block_on(build_provider(options))?; - runtime().block_on(register_with_widening_view(ctx, name.as_str(), provider))?; - Ok(()) - }) -} - -/// Recursively compute the arrow_cast destination-type string for a column -/// whose Arrow type is not directly readable by Spark's `ArrowColumnVector`. -/// -/// Spark 3.5's `ArrowColumnVector` has no accessor for unsigned ints, Time, -/// or Float16. We use DataFusion's built-in `arrow_cast(col, '')` -/// (rather than SQL `CAST`) because it preserves nested structure — a -/// `List` becomes `List(Int32)` end-to-end, not just at the top -/// level. Returns `Some(target)` if any widening is needed, `None` if the -/// column passes through unchanged. 
-/// -/// Coverage: -/// - scalars: UInt8/16/32/64, Float16, Time32/64 -/// - List<...>, LargeList<...>, FixedSizeList<..., size> with a widenable -/// element type (handles the `item` field rejection at any nesting depth) -/// -/// NOT covered in v1: Struct<...> / Map<...> with widenable children. The -/// JVM schema converter still rejects those with the original error. -fn arrow_cast_widening(dt: &DataType) -> Option { - match dt { - DataType::UInt8 => Some("Int16".into()), - DataType::UInt16 => Some("Int32".into()), - DataType::UInt32 => Some("Int64".into()), - // UInt64 widening is lossy for values ≥ 2^63 — documented limitation. - DataType::UInt64 => Some("Int64".into()), - DataType::Float16 => Some("Float32".into()), - DataType::Time32(_) => Some("Int32".into()), - DataType::Time64(_) => Some("Int64".into()), - // Spark's ArrowColumnVector accepts only Timestamp(Microsecond, ...). - // Other units cause `UNSUPPORTED_ARROWTYPE` at executor batch wrap. - // Cast all timestamps to microsecond precision, preserving the - // timezone string (None vs Some(tz)). - DataType::Timestamp(unit, tz) => { - if *unit == TimeUnit::Microsecond { - None - } else { - let tz_str = match tz { - None => "None".to_string(), - Some(s) => format!("Some(\"{}\")", s.replace('\\', "\\\\").replace('"', "\\\"")), - }; - Some(format!("Timestamp(Microsecond, {tz_str})")) - } - } - DataType::List(field) => { - arrow_cast_widening(field.data_type()).map(|t| format!("List({t})")) - } - DataType::LargeList(field) => { - arrow_cast_widening(field.data_type()).map(|t| format!("LargeList({t})")) - } - DataType::FixedSizeList(field, size) => arrow_cast_widening(field.data_type()) - .map(|t| format!("FixedSizeList({t}, {size})")), - _ => None, - } -} - -/// Register `provider` under `external_name`. 
If the provider's schema has any -/// fields that need a Spark-compatibility widen (see [`widen_sql_type`]), the -/// raw provider is stashed under a mangled name and `external_name` is -/// registered as a SQL view that casts the offending columns. v1 widens -/// top-level fields only — nested unsigned ints inside a Struct still surface -/// the original Arrow type and will fail in the JVM schema converter. -async fn register_with_widening_view( - ctx: &SessionContext, - external_name: &str, - provider: Arc, -) -> JniResult<()> { - let schema: Arc = provider.schema(); - let needs_view = schema - .fields() - .iter() - .any(|f| arrow_cast_widening(f.data_type()).is_some()); - - if !needs_view { - ctx.register_table(external_name, provider)?; - return Ok(()); - } - - let raw_name = format!("__rerun_raw__{external_name}"); - ctx.register_table(raw_name.as_str(), provider)?; - - let select_list = schema - .fields() - .iter() - .map(|f| { - let name = f.name(); - // Identifier quoting: double quotes; escape any embedded "". - let quoted = format!("\"{}\"", name.replace('"', "\"\"")); - match arrow_cast_widening(f.data_type()) { - // arrow_cast preserves nested structure (List → - // List(Int32)); SQL CAST would have to be List-of-scalar - // only and produce a different operator graph. 
- Some(target) => format!("arrow_cast({quoted}, '{target}') AS {quoted}"), - None => quoted, - } - }) - .collect::>() - .join(", "); - let sql = format!( - "SELECT {select_list} FROM \"{}\"", - raw_name.replace('"', "\"\"") - ); - - let df = ctx.sql(&sql).await?; - ctx.register_table(external_name, df.into_view())?; - Ok(()) -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_SessionContext_listRerunSegmentsNative<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - options_proto: JByteArray<'local>, -) -> jobjectArray { - try_unwrap_or_throw( - &mut env, - std::ptr::null_mut(), - |env| -> JniResult { - let bytes: Vec = env.convert_byte_array(&options_proto)?; - let options = RerunTableOptions::decode(bytes.as_slice())?; - - init_rustls_crypto(); - let origin: re_uri::Origin = options.url.parse().map_err( - |e: re_uri::Error| -> Box { - format!("invalid Rerun url {:?}: {}", options.url, e).into() - }, - )?; - let handle = build_registry_handle(&origin, options.token.as_str())?; - - let segments: Vec = runtime().block_on(async { - let entry_id = resolve_entry_id(&handle, &origin, &options).await?; - let mut client = handle.client(origin.clone()).await?; - let raw = client.get_dataset_segment_ids(entry_id).await?; - Ok::, Box>( - raw.into_iter().map(|s| s.into_inner()).collect(), - ) - })?; - - let string_class = env.find_class("java/lang/String")?; - let empty = env.new_string("")?; - let arr = env.new_object_array(segments.len() as i32, &string_class, &empty)?; - for (i, s) in segments.iter().enumerate() { - let js = env.new_string(s)?; - env.set_object_array_element(&arr, i as i32, js)?; - } - Ok(arr.into_raw()) - }, - ) -} diff --git a/proto/rerun_table_options.proto b/proto/rerun_table_options.proto deleted file mode 100644 index c5e8ee6..0000000 --- a/proto/rerun_table_options.proto +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -syntax = "proto3"; - -package datafusion_java; - -option java_package = "org.apache.datafusion.protobuf"; -option java_multiple_files = true; - -// Setup parameters for constructing a Rerun `TableProvider` and registering -// it into the embedded DataFusion `SessionContext`. These mirror the -// "Class 1" (author-set, not query-derived) parameters from the Spark -// connector design: every executor task receives the same envelope with -// `segments` narrowed to that task's single segment. -message RerunTableOptions { - // Rerun gRPC endpoint, e.g. "rerun+http://localhost:51234". - string url = 1; - - // EXACTLY ONE of `dataset_name` or `dataset_id` must be set. - // - `dataset_name`: resolved server-side via FindEntries(name=...). - // - `dataset_id`: parsed directly as a Tuid (skips the FindEntries RPC). - string dataset_name = 2; - string dataset_id = 3; - - // Optional explicit segment list. Empty → provider serves all segments - // (the driver-side `listRerunSegments` JNI is used by the Spark - // connector to enumerate before partitioning). - repeated string segments = 4; - - // Optional timeline name to set as `QueryExpression.filtered_index`. - // Empty → static-only query. - string index = 5; - - // Optional auth token (Bearer JWT). 
Empty → `Credentials::Stored` - // (looks up stored creds for this origin; falls back to REDAP_TOKEN). - string token = 6; -} From e7b58484d32564502ed98d402836dd59c61023fb Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 14:31:26 +0200 Subject: [PATCH 03/22] feat(spark): add Spark DataSource V2 connector + pyspark FFI demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vendors the generic connector-core from rerun-io/rerun-spark-connector into a new `spark/` Maven module (Apache 2.0 headers, Scala 2.13), wiring it against this repo's `SessionContext.registerFfiTable`. Adds an `ExampleFfiProviderFactory` against the existing example MemTable cdylib and a self-contained pyspark demo that exercises schema inference, scan, projection, and predicate pushdown across the FFI boundary. - spark/: DSv2 plumbing (DatafusionSource/Table/Scan/Reader), Arrow → Spark schema converter, Spark V2 Predicate → DataFusion LogicalExprNode translator, and a widening cdylib (`libdatafusion_spark_helper`) wrapping Spark-incompatible Arrow types via `arrow::compute::cast` before scans. - spark/.../META-INF/services/...DataSourceRegister: registers the `datafusion` short name so `spark.read.format("datafusion")` resolves. - examples/ExampleFfiProviderFactory: minimal FfiProviderFactory delegating to FfiTableProviderExampleNative.createMemTableProvider(). - examples/python/: pyspark demo + README documenting the uv venv, side-loaded Scala 2.13 Spark distro, `-Dmaven.repo.local` flow, and the `extraClassPath` Arrow 19 / flatbuffers 25 / protobuf 3.25 overrides needed because Spark 3.5.7 ships older versions. - pom.xml: add `spark` module, allow `**/*.md` and `**/Cargo.lock` through RAT, exclude `**/META-INF/services/**` (line-delimited SPI files). - DatafusionSource.scala / DatafusionColumnarPartitionReader.scala: drop reflective `registerFfiTable` calls (carried over from the upstream pre-merge state) for direct invocation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/pom.xml | 9 + examples/python/README.md | 134 + examples/python/ffi_table_provider_demo.py | 175 + .../examples/ExampleFfiProviderFactory.java | 63 + pom.xml | 7 +- spark/native/Cargo.lock | 3655 +++++++++++++++++ spark/native/Cargo.toml | 29 + spark/native/src/lib.rs | 128 + spark/native/src/widening.rs | 335 ++ spark/pom.xml | 254 ++ .../io/datafusion/spark/FfiHelperNative.java | 52 + .../datafusion/spark/FfiProviderFactory.java | 66 + .../datafusion/spark/NativeLibraryLoader.java | 77 + ...pache.spark.sql.sources.DataSourceRegister | 1 + .../datafusion/spark/ArrowToSparkSchema.scala | 151 + .../io/datafusion/spark/DatafusionBatch.scala | 63 + .../DatafusionColumnarPartitionReader.scala | 120 + .../spark/DatafusionInputPartition.scala | 42 + .../DatafusionPartitionReaderFactory.scala | 45 + .../io/datafusion/spark/DatafusionScan.scala | 47 + .../spark/DatafusionScanBuilder.scala | 76 + .../datafusion/spark/DatafusionSource.scala | 97 + .../io/datafusion/spark/DatafusionTable.scala | 51 + .../spark/NonClosingArrowColumnVector.scala | 33 + .../spark/SparkPredicateTranslator.scala | 214 + .../spark/ArrowToSparkSchemaTest.scala | 106 + .../spark/SparkPredicateTranslatorTest.scala | 87 + 27 files changed, 6116 insertions(+), 1 deletion(-) create mode 100644 examples/python/README.md create mode 100644 examples/python/ffi_table_provider_demo.py create mode 100644 examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java create mode 100644 spark/native/Cargo.lock create mode 100644 spark/native/Cargo.toml create mode 100644 spark/native/src/lib.rs create mode 100644 spark/native/src/widening.rs create mode 100644 spark/pom.xml create mode 100644 spark/src/main/java/io/datafusion/spark/FfiHelperNative.java create mode 100644 spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java create mode 100644 spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java create mode 
100644 spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister create mode 100644 spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/NonClosingArrowColumnVector.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala create mode 100644 spark/src/test/scala/io/datafusion/spark/ArrowToSparkSchemaTest.scala create mode 100644 spark/src/test/scala/io/datafusion/spark/SparkPredicateTranslatorTest.scala diff --git a/examples/pom.xml b/examples/pom.xml index 78fcc5c..6220afe 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -44,6 +44,15 @@ under the License. 
org.apache.datafusion datafusion-java + + + org.apache.datafusion + datafusion-java-spark_2.13 + ${project.version} + provided + org.apache.arrow arrow-vector diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000..edf7f3f --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,134 @@ +# PySpark end-to-end demo + +`ffi_table_provider_demo.py` proves the full DataFusion → Spark path: + +``` +examples/native (cdylib) <-- in-memory MemTable + | jlong (FFI_TableProvider*) + v +ExampleFfiProviderFactory <-- implements FfiProviderFactory + | Class.forName(...) + v +datafusion-java-spark <-- DSv2 plumbing, widening, predicate xlate + | spark.read.format("datafusion") + v +PySpark DataFrame <-- printSchema / show / filter / select +``` + +## Prerequisites + +1. **Java 17.** `JAVA_HOME` must point at a JDK 17 install. + +2. **Three cdylibs** built from this repo: + + ```bash + cd native && cargo build --release && cd .. + cd examples/native && cargo build --release && cd ../.. + cd spark/native && cargo build --release && cd ../.. + ``` + +3. **Maven artifacts installed into a side-loaded local repository.** + + The script reads `arrow-c-data`, `flatbuffers-java`, and `protobuf-java` + jars from `${DATAFUSION_DEMO_M2:-/tmp/m2-datafusion}` (Spark's bundled + versions are too old, so the demo prepends our copies on + `spark.driver/executor.extraClassPath`). Tell Maven to install there: + + ```bash + mvn install -DskipTests \ + -Ddatafusion.native.profile=release \ + -Dmaven.repo.local=/tmp/m2-datafusion + ``` + + If you already use `~/.m2`, point `DATAFUSION_DEMO_M2` at it instead and + skip `-Dmaven.repo.local`. + +4. **A Scala 2.13 Spark distribution.** The PyPI `pyspark` wheel embeds + Scala 2.12 jars; the connector is compiled against 2.13, so we override + `SPARK_HOME` before importing pyspark. 
Download once: + + ```bash + cd /tmp + curl -L -o spark-2.13.tgz \ + https://archive.apache.org/dist/spark/spark-3.5.7/spark-3.5.7-bin-hadoop3-scala2.13.tgz + tar xzf spark-2.13.tgz + ``` + + The script defaults `SPARK_HOME` to + `/tmp/spark-3.5.7-bin-hadoop3-scala2.13`; set the env var if you put it + elsewhere. + +5. **A self-contained Python venv with `pyspark==3.5.7`** (uv keeps it + isolated from system site-packages): + + ```bash + cd examples/python + uv venv --python 3.11 .venv + uv pip install --python .venv/bin/python "pyspark==3.5.7" + cd ../.. + ``` + +## Run + +```bash +examples/python/.venv/bin/python examples/python/ffi_table_provider_demo.py +``` + +Expected output: + +``` +=== schema === +root + |-- id: long (nullable = false) + |-- name: string (nullable = true) + |-- value: double (nullable = true) + +=== full scan === ++---+-----+-----+ +|id |name |value| ++---+-----+-----+ +|1 |alice|1.5 | +|2 |bob |2.5 | +|3 |NULL |3.5 | +|4 |dave |NULL | ++---+-----+-----+ + +=== filter pushdown: value > 2.0 === ++---+----+-----+ +|id |name|value| ++---+----+-----+ +|2 |bob |2.5 | +|3 |NULL|3.5 | ++---+----+-----+ + +=== projection: id, name === ++---+-----+ +|id |name | ++---+-----+ +|1 |alice| +|2 |bob | +|3 |NULL | +|4 |dave | ++---+-----+ +``` + +Filter row count drops from 4 → 2 because the predicate is pushed across the +FFI boundary as a `LogicalExprNode` proto and applied inside DataFusion before +Arrow batches cross back to Spark. + +## Notes + +- `master("local[2]")` keeps driver + executor in one JVM so the example + cdylib loads once. Cluster mode would need the cdylib pre-staged on every + worker (the widening lib is bundled in `datafusion-java-spark`; only the + per-bridge example lib is not). 
+- `extraClassPath` (not `--packages` / `userClassPathFirst`) is used because + the Spark distro ships Arrow 12, flatbuffers 1.12, and protobuf 2.5, all + of which we need to override; userClassPathFirst splits Netty across two + class loaders and the `arrow-memory-netty-buffer-patch` shim breaks. +- The `datafusion` format short name resolves via the SPI file in + `spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister`. + You can also use the FQCN: `format("io.datafusion.spark.DatafusionSource")`. +- To swap in your own bridge, write a `FfiProviderFactory` against your own + cdylib (mirroring `ExampleFfiProviderFactory`) and pass its FQCN via + `option("df.factory", ...)`. diff --git a/examples/python/ffi_table_provider_demo.py b/examples/python/ffi_table_provider_demo.py new file mode 100644 index 0000000..64128ea --- /dev/null +++ b/examples/python/ffi_table_provider_demo.py @@ -0,0 +1,175 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""End-to-end PySpark demo of the DataFusion FFI table provider. + +Wires the in-memory example MemTable produced by ``examples/native`` into a +Spark DataSource V2 scan through the generic connector in ``spark/``. 
+ +Prerequisites (run from the repo root): + + cd native && cargo build --release && cd .. + cd examples/native && cargo build --release && cd ../.. + cd spark/native && cargo build --release && cd ../.. + mvn install -Ddatafusion.native.profile=release -DskipTests -Dmaven.repo.local=/tmp/m2-datafusion + +Run: + + python3 examples/python/ffi_table_provider_demo.py +""" + +import glob +import os +import sys +from pathlib import Path + +# The PyPI ``pyspark`` wheel embeds a Scala 2.12 Spark distribution; this +# connector is compiled against Scala 2.13. Override SPARK_HOME (before the +# pyspark import so the wheel honours it) to a side-loaded 2.13 distribution. +_SPARK_HOME_2_13 = os.environ.get( + "SPARK_HOME", + "/tmp/spark-3.5.7-bin-hadoop3-scala2.13", +) +if not Path(_SPARK_HOME_2_13, "jars", "scala-library-2.13.8.jar").exists(): + sys.exit( + f"missing Scala 2.13 Spark distribution at {_SPARK_HOME_2_13}. " + "Download from https://archive.apache.org/dist/spark/spark-3.5.7/" + "spark-3.5.7-bin-hadoop3-scala2.13.tgz and extract to that path " + "(or set SPARK_HOME to your own 2.13 distro)." + ) +os.environ["SPARK_HOME"] = _SPARK_HOME_2_13 + +from pyspark.sql import SparkSession + + +REPO_ROOT = Path(__file__).resolve().parents[2] +VERSION = "0.2.0-SNAPSHOT" +ARROW_VERSION = "19.0.0" +FLATBUFFERS_VERSION = "25.2.10" +PROTOBUF_VERSION = "3.25.5" +# Local maven repository populated by ``mvn install -Dmaven.repo.local=...``. +M2_REPO = Path(os.environ.get("DATAFUSION_DEMO_M2", "/tmp/m2-datafusion")) + + +def _resolve_jar(module: str, artifact: str) -> str: + candidates = glob.glob(str(REPO_ROOT / module / "target" / f"{artifact}-{VERSION}.jar")) + if not candidates: + sys.exit( + f"missing jar for {artifact} under {module}/target/. " + f"Run 'mvn install -DskipTests' from {REPO_ROOT} first." 
+ ) + return candidates[0] + + +def _m2_jar(group_path: str, artifact: str, version: str) -> str: + path = M2_REPO / group_path / artifact / version / f"{artifact}-{version}.jar" + if not path.exists(): + sys.exit( + f"missing dependency jar {path}. " + f"Re-run 'mvn install -DskipTests -Dmaven.repo.local={M2_REPO}'." + ) + return str(path) + + +def main() -> None: + # Spark 3.5.7 bundles Arrow 12.0.1; datafusion-java is compiled against + # Arrow 19, which needs ArrowArrayStream (added after 12) and a much newer + # flatbuffers runtime. Ship our copies on spark.jars and prepend them on extraClassPath + # so they win over the bundled jars on both driver and executor. + arrow_jars = [ + _m2_jar("org/apache/arrow", "arrow-format", ARROW_VERSION), + _m2_jar("org/apache/arrow", "arrow-vector", ARROW_VERSION), + _m2_jar("org/apache/arrow", "arrow-memory-core", ARROW_VERSION), + _m2_jar("org/apache/arrow", "arrow-memory-netty", ARROW_VERSION), + _m2_jar( + "org/apache/arrow", + "arrow-memory-netty-buffer-patch", + ARROW_VERSION, + ), + _m2_jar("org/apache/arrow", "arrow-c-data", ARROW_VERSION), + _m2_jar( + "com/google/flatbuffers", "flatbuffers-java", FLATBUFFERS_VERSION + ), + # Spark ships protobuf-java 2.5.0 (sans MessageOrBuilder). The proto + # surface in core (LogicalExprNode etc.) needs 3.25.x. + _m2_jar("com/google/protobuf", "protobuf-java", PROTOBUF_VERSION), + ] + app_jars = [ + _resolve_jar("core", "datafusion-java"), + _resolve_jar("spark", "datafusion-java-spark_2.13"), + _resolve_jar("examples", "datafusion-java-examples"), + *arrow_jars, + ] + jars = ",".join(app_jars) + # Prepend the same jars onto the JVM classpath (extraClassPath) so Arrow 19's classes + # are loaded by the system class loader — avoids the + # ``UnsafeDirectLittleEndian cannot access superclass WrappedByteBuf`` + # IllegalAccessError that ChildFirstURLClassLoader produces when the + # buffer-patch class lands in the child loader while Netty stays in the app + # loader. 
+ extra_classpath = ":".join(app_jars) + + spark = ( + SparkSession.builder.appName("datafusion-ffi-demo") + .master("local[2]") + .config("spark.jars", jars) + .config("spark.driver.extraClassPath", extra_classpath) + .config("spark.executor.extraClassPath", extra_classpath) + .config( + "spark.driver.extraJavaOptions", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + ) + .config( + "spark.executor.extraJavaOptions", + "--add-opens=java.base/java.nio=ALL-UNNAMED", + ) + .getOrCreate() + ) + + # The example cdylib (libdatafusion_java_ffi_example.{so,dylib}) is loaded + # by FfiTableProviderExampleNative from examples/native/target. As long as + # PySpark is launched from the repo root the relative-path search succeeds; + # otherwise set example.ffi.lib.path via spark.driver.extraJavaOptions. + os.chdir(REPO_ROOT) + + df = ( + spark.read.format("datafusion") + .option( + "df.factory", + "org.apache.datafusion.examples.ExampleFfiProviderFactory", + ) + .load() + ) + + print("=== schema ===") + df.printSchema() + + print("=== full scan ===") + df.show(truncate=False) + + print("=== filter pushdown: value > 2.0 ===") + df.filter("value > 2.0").show(truncate=False) + + print("=== projection: id, name ===") + df.select("id", "name").show(truncate=False) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java new file mode 100644 index 0000000..8e6e30a --- /dev/null +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion.examples; + +import java.util.Map; + +import io.datafusion.spark.FfiProviderFactory; + +/** + * Minimal {@link FfiProviderFactory} that exposes the example {@code MemTable} produced by {@link + * FfiTableProviderExampleNative#createMemTableProvider()} as a Spark DataSource V2 source. + * + *

Wire it into PySpark with: + * + *

{@code
+ * df = (spark.read.format("datafusion")
+ *         .option("df.factory", "org.apache.datafusion.examples.ExampleFfiProviderFactory")
+ *         .load())
+ * }
+ * + *

No driver-side options are interpreted — the underlying {@code MemTable} is hard-coded in the + * cdylib at {@code examples/native}. A single partition (id {@code "p0"}) is reported so Spark + * spawns one task; the executor calls {@link #createProvider(byte[])} to obtain a fresh {@code + * FFI_TableProvider} pointer, hands it to {@link + * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the resulting + * Arrow record batches back into the Spark scan. + */ +public final class ExampleFfiProviderFactory implements FfiProviderFactory { + + public ExampleFfiProviderFactory() {} + + @Override + public byte[] encodeOptions(Map sparkOptions) { + return new byte[0]; + } + + @Override + public String[] listPartitions(byte[] optionsProtoBytes) { + return new String[] {"p0"}; + } + + @Override + public long createProvider(byte[] optionsProtoBytes) { + return FfiTableProviderExampleNative.createMemTableProvider(); + } +} diff --git a/pom.xml b/pom.xml index 6210841..282feb9 100644 --- a/pom.xml +++ b/pom.xml @@ -32,6 +32,7 @@ under the License. core + spark examples @@ -159,6 +160,7 @@ under the License. README.md CONTRIBUTING.md docs/** + **/*.md .gitignore .idea/** @@ -175,8 +177,11 @@ under the License. **/target/** native/target/** tpch-data/** - + native/Cargo.lock + **/Cargo.lock + + **/META-INF/services/** dev/release/rat_exclude_files.txt diff --git a/spark/native/Cargo.lock b/spark/native/Cargo.lock new file mode 100644 index 0000000..d22d26e --- /dev/null +++ b/spark/native/Cargo.lock @@ -0,0 +1,3655 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ar_archive_writer" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" +dependencies = [ + "object", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + 
"arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.17.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-cast" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-ipc" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "chrono", + "half", + "indexmap", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" +dependencies = [ + "bitflags", + "serde_core", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-compression" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac" +dependencies = [ + "compression-codecs", + "compression-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures 0.3.0", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "8.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "cc" +version = "1.2.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chrono" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "chrono-tz" 
+version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width", +] + +[[package]] +name = "compression-codecs" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "const_panic" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" +dependencies = [ 
+ "typewit", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "core_extensions" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "6.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" +dependencies = [ + "arrow", + "arrow-schema", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + 
"datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools", + "liblzma", + "log", + "object_store", + "parking_lot", + "parquet", + "rand", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools", + "log", + "object_store", +] + +[[package]] +name = "datafusion-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" +dependencies = [ + "ahash", + "arrow", + 
"arrow-ipc", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools", + "liblzma", + "log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "zstd", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" +dependencies = 
[ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", + "tokio-stream", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "parquet", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" + +[[package]] +name = "datafusion-execution" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" +dependencies = [ + "arrow", + "arrow-buffer", + "async-trait", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr-common", + "futures", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap", + "itertools", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap", + "itertools", + "paste", +] + +[[package]] +name = "datafusion-ffi" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" +dependencies = [ + "abi_stable", + "arrow", + "arrow-schema", + "async-ffi", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto", + "datafusion-proto-common", + "datafusion-session", + "futures", + "log", + "prost", + "semver", + "tokio", +] + +[[package]] +name = "datafusion-functions" +version = "53.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools", + "log", + "md-5", + "memchr", + "num-traits", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "num-traits", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "hashbrown 0.16.1", + 
"itertools", + "itoa", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-optimizer" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap", + "itertools", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "53.1.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", + "paste", + "petgraph", + "recursive", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" 
+dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "log", + "num-traits", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost", + "rand", +] + +[[package]] +name = "datafusion-proto-common" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + +[[package]] +name = "datafusion-pruning" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools", + "log", +] + +[[package]] +name = 
"datafusion-session" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", +] + +[[package]] +name = "datafusion-spark-helper" +version = "0.1.0" +dependencies = [ + "arrow", + "async-trait", + "datafusion", + "datafusion-ffi", + "futures", + "jni", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "53.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" +dependencies = [ + "arrow", + "bigdecimal", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-functions-nested", + "indexmap", + "log", + "recursive", + "regex", + "sqlparser", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + 
"zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + 
"either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys 0.3.1", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn 2.0.117", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +dependencies = [ + "cfg-if", + "futures-util", + "wasm-bindgen", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + 
+[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libbz2-rs-sys" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + 
+[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + +[[package]] +name = "liblzma" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" + +[[package]] +name = "lz4_flex" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef0d4ed8669f8f8826eb00dc878084aa8f253506c4fd5e8f58f5bce72ddb97e" +dependencies = [ + "twox-hash", +] + 
+[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.13.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures-channel", + "futures-core", + "futures-util", + "http", + "humantime", + "itertools", + "parking_lot", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "parquet" +version = "58.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.17.1", + "lz4_flex", + "num-bigint", + "num-integer", + "num-traits", + "object_store", + "paste", + "seq-macro", + "simdutf8", + 
"snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.117", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "psm" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.117", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "semver" +version = "1.0.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "sqlparser" +version = "0.61.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stacker" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" 
+dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.61.2", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + 
"futures-core", + "pin-project-lite", + "tokio", + "tokio-util", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "tstr" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" +dependencies = [ + "tstr_proc_macros", +] + +[[package]] +name = "tstr_proc_macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "typewit" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "uuid" +version = "1.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" +dependencies = [ + "getrandom 0.4.2", + 
"js-sys", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn 2.0.117", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.123" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + 
"winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "yoke" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" 
+version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/spark/native/Cargo.toml b/spark/native/Cargo.toml new file mode 100644 index 0000000..4650b2a --- /dev/null +++ b/spark/native/Cargo.toml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +[package] +name = "datafusion-spark-helper" +version = "0.1.0" +edition = "2021" +publish = false + +[lib] +# cdylib for the JVM to load via System.load; rlib so Rust-level tests can +# exercise the WideningTableProvider directly without going through JNI. 
+crate-type = ["cdylib", "rlib"] + +[dependencies] +arrow = { version = "58", features = ["ffi"] } +async-trait = "0.1" +datafusion = { version = "53.1.0" } +datafusion-ffi = "53.1.0" +futures = "0.3" +jni = "0.21" +tokio = { version = "1", features = ["rt-multi-thread"] } diff --git a/spark/native/src/lib.rs b/spark/native/src/lib.rs new file mode 100644 index 0000000..8c50a00 --- /dev/null +++ b/spark/native/src/lib.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Widening cdylib for the generic Spark connector. +//! +//! Single JNI entry point: `wrapWithWidening(jlong) -> jlong`. Takes a raw +//! `FFI_TableProvider` pointer produced by a bridge cdylib, wraps the inner +//! `TableProvider` in a [`WideningTableProvider`] that exposes +//! Spark-compatible Arrow types (UInt*→signed wider, Float16→Float32, +//! Time*→Int wider, Timestamp(*, tz)→Timestamp(Microsecond, tz)), and +//! re-FFIs the result for the consumer (datafusion-java's cdylib). +//! +//! No SessionContext or SQL — kernel-level `arrow::compute::cast` only. 
+ +use std::error::Error; +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::sync::{Arc, OnceLock}; + +use datafusion::catalog::TableProvider; +use datafusion::execution::TaskContextProvider; +use datafusion::prelude::SessionContext; +use datafusion_ffi::execution::FFI_TaskContextProvider; +use datafusion_ffi::table_provider::FFI_TableProvider; +use jni::objects::JClass; +use jni::sys::jlong; +use jni::JNIEnv; +use tokio::runtime::{Handle, Runtime}; + +pub mod widening; + +use widening::WideningTableProvider; + +type JniResult = Result>; + +/// Shared Tokio runtime. The widening cdylib does not itself await any IO, +/// but the FFI_TableProvider it produces is registered on a foreign +/// SessionContext that may schedule work via this handle. +fn runtime() -> &'static Handle { + static RUNTIME: OnceLock = OnceLock::new(); + RUNTIME + .get_or_init(|| Runtime::new().expect("tokio runtime init failed")) + .handle() +} + +/// Shared "host" SessionContext within the widening cdylib. Only used as +/// the source of a `TaskContextProvider` passed into `FFI_TableProvider::new`. +/// Lives for the lifetime of the cdylib; no datasets are ever registered on it. +fn host_session_context() -> &'static Arc { + static CTX: OnceLock> = OnceLock::new(); + CTX.get_or_init(|| Arc::new(SessionContext::new())) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_wrapWithWidening<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_raw_ptr: jlong, +) -> jlong { + try_unwrap_or_throw(&mut env, 0, |_env| { + if ffi_raw_ptr == 0 { + return Err("wrapWithWidening: input FFI_TableProvider pointer is null".into()); + } + // Take ownership of the producer's FFI_TableProvider. + let ffi_raw: Box = + unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; + + // Cross-cdylib hop: `Arc::::from(&FFI_TableProvider)` + // returns a `ForeignTableProvider` wrapper that delegates back through + // the producer's vtable. 
Drop our `Box` immediately afterward — the + // ForeignTableProvider clone owns its own retained copy. + let inner: Arc = (&*ffi_raw).into(); + drop(ffi_raw); + + let widened: Arc = Arc::new(WideningTableProvider::new(inner)); + + // Re-wrap as an FFI_TableProvider for the consumer. + let ctx_provider: Arc = + Arc::clone(host_session_context()) as Arc; + let ffi_task_ctx = FFI_TaskContextProvider::from(&ctx_provider); + let ffi = FFI_TableProvider::new( + widened, + /*can_support_pushdown_filters=*/ true, + Some(runtime().clone()), + ffi_task_ctx, + /*logical_codec=*/ None, + ); + Ok(Box::into_raw(Box::new(ffi)) as jlong) + }) +} + +/// Run `f`, catching panics and translating `Err` into a plain Java +/// `RuntimeException`. The connector-core helper does not know about +/// datafusion-java's exception hierarchy, so this stays minimal. +fn try_unwrap_or_throw(env: &mut JNIEnv, default: T, f: F) -> T +where + F: FnOnce(&mut JNIEnv) -> JniResult, +{ + match catch_unwind(AssertUnwindSafe(|| f(env))) { + Ok(Ok(value)) => value, + Ok(Err(err)) => { + let _ = env.throw_new("java/lang/RuntimeException", err.to_string()); + default + } + Err(panic) => { + let msg = panic + .downcast_ref::<&'static str>() + .map(|s| s.to_string()) + .or_else(|| panic.downcast_ref::().cloned()) + .unwrap_or_else(|| "rust panic in widening cdylib".to_string()); + let _ = env.throw_new("java/lang/RuntimeException", format!("panic: {msg}")); + default + } + } +} diff --git a/spark/native/src/widening.rs b/spark/native/src/widening.rs new file mode 100644 index 0000000..92874ad --- /dev/null +++ b/spark/native/src/widening.rs @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Kernel-level Arrow type widening for Spark consumption. +//! +//! Spark 3.5's `ArrowColumnVector` has no accessor for unsigned ints, Time*, +//! Float16, or non-microsecond Timestamp. The widening machinery here wraps +//! an inner `TableProvider` with one that exposes a "widened" schema — +//! UInt*→Int wider, Float16→Float32, Time*→Int wider, Timestamp(*, tz)→ +//! Timestamp(Microsecond, tz), recursing into List/LargeList/FixedSizeList +//! children — and applies `arrow::compute::cast` to each produced +//! RecordBatch column-wise. No SQL, no SessionContext, no view machinery. 
+ +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use arrow::array::RecordBatch; +use arrow::compute::cast; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef, TimeUnit}; +use async_trait::async_trait; +use datafusion::catalog::{Session, TableProvider}; +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, +}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_expr::EquivalenceProperties; +use futures::stream::StreamExt; + +/// Compute the cast-target DataType for an Arrow type not directly readable +/// by Spark's `ArrowColumnVector`. Returns `None` if the type passes through. +pub fn arrow_cast_widening(dt: &DataType) -> Option { + match dt { + DataType::UInt8 => Some(DataType::Int16), + DataType::UInt16 => Some(DataType::Int32), + DataType::UInt32 => Some(DataType::Int64), + // UInt64 → Int64: lossy for values ≥ 2^63. Documented in REARCHITECTURE.md. 
+ DataType::UInt64 => Some(DataType::Int64), + DataType::Float16 => Some(DataType::Float32), + DataType::Time32(_) => Some(DataType::Int32), + DataType::Time64(_) => Some(DataType::Int64), + DataType::Timestamp(unit, tz) => { + if *unit == TimeUnit::Microsecond { + None + } else { + Some(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())) + } + } + DataType::List(field) => arrow_cast_widening(field.data_type()) + .map(|inner| DataType::List(widened_child(field, inner))), + DataType::LargeList(field) => arrow_cast_widening(field.data_type()) + .map(|inner| DataType::LargeList(widened_child(field, inner))), + DataType::FixedSizeList(field, size) => arrow_cast_widening(field.data_type()) + .map(|inner| DataType::FixedSizeList(widened_child(field, inner), *size)), + _ => None, + } +} + +fn widened_child(field: &Arc, new_type: DataType) -> Arc { + Arc::new(Field::new(field.name(), new_type, field.is_nullable())) +} + +/// Build the widened schema by walking inner fields and replacing types. +/// Returns the widened schema plus per-column target types (None where no cast). +fn widened_schema(inner: &ArrowSchema) -> (SchemaRef, Vec>) { + let mut fields = Vec::with_capacity(inner.fields().len()); + let mut targets = Vec::with_capacity(inner.fields().len()); + for f in inner.fields() { + match arrow_cast_widening(f.data_type()) { + Some(target) => { + fields.push(Arc::new(Field::new(f.name(), target.clone(), f.is_nullable()))); + targets.push(Some(target)); + } + None => { + fields.push(Arc::clone(f)); + targets.push(None); + } + } + } + (Arc::new(ArrowSchema::new(fields)), targets) +} + +/// TableProvider wrapping an inner provider, exposing a widened schema and +/// emitting RecordBatches whose columns have been cast to the widened types. +#[derive(Debug)] +pub struct WideningTableProvider { + inner: Arc, + widened: SchemaRef, + /// Targets indexed by the inner-schema column position; `None` = pass through. 
+ targets: Vec>, +} + +impl WideningTableProvider { + pub fn new(inner: Arc) -> Self { + let (widened, targets) = widened_schema(&inner.schema()); + Self { inner, widened, targets } + } +} + +#[async_trait] +impl TableProvider for WideningTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.widened) + } + + fn table_type(&self) -> TableType { + self.inner.table_type() + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + self.inner.supports_filters_pushdown(filters) + } + + async fn scan( + &self, + session: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> Result> { + let inner_plan = self.inner.scan(session, projection, filters, limit).await?; + let (projected_widened, projected_targets) = match projection { + Some(idxs) => { + let fields: Vec> = + idxs.iter().map(|i| Arc::clone(&self.widened.fields()[*i])).collect(); + let targets: Vec> = + idxs.iter().map(|i| self.targets[*i].clone()).collect(); + (Arc::new(ArrowSchema::new(fields)) as SchemaRef, targets) + } + None => (Arc::clone(&self.widened), self.targets.clone()), + }; + Ok(Arc::new(WideningExec::new( + inner_plan, + projected_widened, + projected_targets, + ))) + } +} + +/// ExecutionPlan that runs the inner plan and casts each output RecordBatch +/// column-wise per the supplied targets. Pure stream-map wrapper; no +/// buffering, no internal state. +pub struct WideningExec { + inner: Arc, + schema: SchemaRef, + /// One entry per output column; `None` = pass through. 
+ targets: Vec>, + properties: Arc, +} + +impl WideningExec { + fn new( + inner: Arc, + schema: SchemaRef, + targets: Vec>, + ) -> Self { + let inner_props = inner.properties(); + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&schema)), + inner_props.partitioning.clone(), + inner_props.emission_type, + inner_props.boundedness, + )); + Self { inner, schema, targets, properties } + } +} + +impl fmt::Debug for WideningExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("WideningExec") + .field("schema", &self.schema) + .field("targets", &self.targets) + .finish() + } +} + +impl DisplayAs for WideningExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let cast_count = self.targets.iter().filter(|t| t.is_some()).count(); + write!(f, "WideningExec: casts={cast_count}") + } +} + +impl ExecutionPlan for WideningExec { + fn name(&self) -> &str { + "WideningExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.inner] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "WideningExec::with_new_children expects exactly one child".to_string(), + )); + } + Ok(Arc::new(WideningExec::new( + children.into_iter().next().unwrap(), + Arc::clone(&self.schema), + self.targets.clone(), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let inner_stream = self.inner.execute(partition, context)?; + let schema = Arc::clone(&self.schema); + let targets = self.targets.clone(); + let mapped = inner_stream.map(move |batch_res| match batch_res { + Err(e) => Err(e), + Ok(batch) => cast_batch(&batch, &schema, &targets), + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), mapped))) + } +} + +fn cast_batch( + batch: 
&RecordBatch, + out_schema: &SchemaRef, + targets: &[Option], +) -> Result { + if batch.num_columns() != targets.len() { + return Err(DataFusionError::Internal(format!( + "WideningExec: produced batch has {} columns, expected {}", + batch.num_columns(), + targets.len() + ))); + } + let mut new_cols = Vec::with_capacity(batch.num_columns()); + for (col, target) in batch.columns().iter().zip(targets.iter()) { + match target { + Some(t) => new_cols.push(cast(col, t).map_err(DataFusionError::from)?), + None => new_cols.push(Arc::clone(col)), + } + } + RecordBatch::try_new(Arc::clone(out_schema), new_cols).map_err(DataFusionError::from) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unsigned_ints_widen_to_signed_wider() { + assert_eq!(arrow_cast_widening(&DataType::UInt8), Some(DataType::Int16)); + assert_eq!(arrow_cast_widening(&DataType::UInt16), Some(DataType::Int32)); + assert_eq!(arrow_cast_widening(&DataType::UInt32), Some(DataType::Int64)); + assert_eq!(arrow_cast_widening(&DataType::UInt64), Some(DataType::Int64)); + } + + #[test] + fn float16_widens_to_float32() { + assert_eq!(arrow_cast_widening(&DataType::Float16), Some(DataType::Float32)); + } + + #[test] + fn time_widens_to_int() { + assert_eq!( + arrow_cast_widening(&DataType::Time32(TimeUnit::Millisecond)), + Some(DataType::Int32) + ); + assert_eq!( + arrow_cast_widening(&DataType::Time64(TimeUnit::Nanosecond)), + Some(DataType::Int64) + ); + } + + #[test] + fn timestamp_normalizes_unit_preserving_tz() { + let ns = DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("UTC"))); + assert_eq!( + arrow_cast_widening(&ns), + Some(DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC")))) + ); + let us_no_tz = DataType::Timestamp(TimeUnit::Microsecond, None); + assert_eq!(arrow_cast_widening(&us_no_tz), None); + } + + #[test] + fn list_recurses_into_children() { + let inner_field = Arc::new(Field::new("item", DataType::UInt16, true)); + let list_ty = DataType::List(inner_field); + 
let widened = arrow_cast_widening(&list_ty).expect("should widen"); + match widened { + DataType::List(field) => assert_eq!(field.data_type(), &DataType::Int32), + other => panic!("expected List, got {other:?}"), + } + } + + #[test] + fn signed_int_passes_through() { + assert_eq!(arrow_cast_widening(&DataType::Int32), None); + assert_eq!(arrow_cast_widening(&DataType::Utf8), None); + } +} diff --git a/spark/pom.xml b/spark/pom.xml new file mode 100644 index 0000000..82586ff --- /dev/null +++ b/spark/pom.xml @@ -0,0 +1,254 @@ + + + + 4.0.0 + + + org.apache.datafusion + datafusion-java-parent + 0.2.0-SNAPSHOT + + + datafusion-java-spark_2.13 + jar + + Apache DataFusion Java Spark Connector + + Generic Spark DataSource V2 connector that registers DataFusion + TableProviders via FFI. Domain bridges implement FfiProviderFactory; + this module supplies the Spark plumbing, predicate translation, + Arrow-to-Spark schema conversion, and a widening cdylib that wraps + an FFI_TableProvider in a WideningTableProvider before Spark sees + it. 
+ + + + 2.13 + 2.13.14 + 3.5.7 + debug + + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.apache.spark + spark-core_${scala.compat.version} + ${spark.version} + provided + + + org.apache.spark + spark-sql_${scala.compat.version} + ${spark.version} + provided + + + + org.apache.datafusion + datafusion-java + + + + org.apache.arrow + arrow-vector + + + org.apache.arrow + arrow-c-data + + + org.apache.arrow + arrow-memory-netty + runtime + + + + org.scalatest + scalatest_${scala.compat.version} + 3.2.18 + test + + + + + + + net.alchim31.maven + scala-maven-plugin + 4.8.1 + + + + compile + testCompile + + + + + ${scala.version} + + -deprecation + -feature + -unchecked + + all + + + + org.apache.maven.plugins + maven-surefire-plugin + + --add-opens=java.base/java.nio=ALL-UNNAMED + true + + + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + copy-widening-cdylib + process-classes + run + + + + + + + + + + + + + + + org.scalatest + scalatest-maven-plugin + 2.2.0 + + ${project.build.directory}/scalatest-reports + . 
+ WDF TestSuite.txt + --add-opens=java.base/java.nio=ALL-UNNAMED + + + + test + test + + + + + + + + + + native-linux-amd64 + + unixlinuxamd64 + + + linux + x86_64 + libdatafusion_spark_helper.so + + + + native-linux-x86_64 + + unixlinuxx86_64 + + + linux + x86_64 + libdatafusion_spark_helper.so + + + + native-linux-aarch64 + + unixlinuxaarch64 + + + linux + aarch64 + libdatafusion_spark_helper.so + + + + native-mac-x86_64 + + macx86_64 + + + darwin + x86_64 + libdatafusion_spark_helper.dylib + + + + native-mac-amd64 + + macamd64 + + + darwin + x86_64 + libdatafusion_spark_helper.dylib + + + + native-mac-aarch64 + + macaarch64 + + + darwin + aarch64 + libdatafusion_spark_helper.dylib + + + + diff --git a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java new file mode 100644 index 0000000..8def9d4 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +/** + * JNI hooks into the connector-core widening cdylib ({@code + * libdatafusion_spark_helper.{so,dylib}}). + * + *

The widening cdylib unwraps an FFI_TableProvider pointer produced by a bridge, wraps it in a + * {@code WideningTableProvider} that applies kernel-level {@code arrow::compute::cast} on incoming + * RecordBatches for any Spark-incompatible Arrow type (unsigned ints, Float16, Time, + * non-microsecond Timestamp, recursive List), and re-FFIs it for the consumer (datafusion-java's + * cdylib via {@code SessionContext.registerFfiTable}). + * + *

The native library is loaded once per JVM via {@link NativeLibraryLoader}. The library payload + * lives inside this jar under {@code io/datafusion/spark///} and is extracted to a temp + * file before {@link System#load}. + */ +public final class FfiHelperNative { + + private FfiHelperNative() {} + + static { + NativeLibraryLoader.loadLibrary("datafusion_spark_helper"); + } + + /** + * Take ownership of an {@code FFI_TableProvider} pointer produced by a bridge cdylib, wrap it in + * a {@code WideningTableProvider}, and re-wrap the result as a fresh {@code FFI_TableProvider}. + * Returns the new raw pointer; the caller owns it. + * + *

The input pointer must not be reused after this call returns: ownership transfers. + */ + public static native long wrapWithWidening(long ffiProviderRawPtr); +} diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java new file mode 100644 index 0000000..1d01f70 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +import java.util.Map; + +/** + * Bridge interface implemented per domain (Rerun, HDF5, custom Iceberg, etc.). A bridge owns its + * own proto schema for connection options and a cdylib that produces an {@code FFI_TableProvider} + * pointer. The connector-core Spark plumbing is generic — it knows only this interface. + * + *

Lifecycle per Spark task: + * + *

    + *
  1. {@link #encodeOptions(Map)} — driver-side, converts the Spark options map into the bridge's + * own proto bytes; ships verbatim through {@code DatafusionInputPartition}. + *
  2. {@link #listPartitions(byte[])} — driver-side, enumerates partition identifiers (e.g. Rerun + * segment ids) so each gets its own Spark task. + *
  3. {@link #createProvider(byte[])} — executor-side, builds the bridge's {@code Arc<dyn + * TableProvider>}, wraps it in an {@code FFI_TableProvider}, returns the raw boxed pointer + * as a {@code jlong}. The caller owns this pointer and is responsible for handing it to + * exactly one consumer (the consumer's {@code Drop} releases it). + *
+ * + *

Implementations must be no-arg constructable so the Spark connector can instantiate them + * reflectively via {@link Class#forName(String)} on the executor. + */ +public interface FfiProviderFactory { + + /** + * Convert Spark's flat option map to the bridge's proto-encoded options. Driver-side only. + * + * @throws IllegalArgumentException if required options are missing or invalid + */ + byte[] encodeOptions(Map sparkOptions); + + /** + * Enumerate partition identifiers for this dataset. One Spark task is created per returned id. + * Driver-side only. + */ + String[] listPartitions(byte[] optionsProtoBytes); + + /** + * Build the underlying {@code Arc} and wrap it in an {@code + * FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; the + * caller takes ownership. + */ + long createProvider(byte[] optionsProtoBytes); +} diff --git a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java new file mode 100644 index 0000000..0b9fc22 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Locale; + +/** + * Extracts a cdylib bundled inside the connector-core jar to a temp file and loads it via {@link + * System#load}. Layout inside the jar: + * + *

+ *   io/datafusion/spark/<os>/<arch>/lib<name>.<ext>
+ * 
+ * + * where {@code } is one of {@code linux}, {@code darwin}, {@code windows} and {@code } is + * {@code x86_64} or {@code aarch64}. + */ +final class NativeLibraryLoader { + + private NativeLibraryLoader() {} + + static void loadLibrary(String name) { + String resource = + String.format( + "/io/datafusion/spark/%s/%s/%s", + currentOs(), currentArch(), System.mapLibraryName(name)); + try (InputStream in = NativeLibraryLoader.class.getResourceAsStream(resource)) { + if (in == null) { + throw new UnsatisfiedLinkError("Native library not found on classpath: " + resource); + } + Path tmp = Files.createTempFile("libdatafusion-spark-", "-" + System.mapLibraryName(name)); + tmp.toFile().deleteOnExit(); + Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING); + System.load(tmp.toAbsolutePath().toString()); + } catch (IOException e) { + throw new UnsatisfiedLinkError( + "Failed to extract native library " + resource + ": " + e.getMessage()); + } + } + + private static String currentOs() { + String os = System.getProperty("os.name", "").toLowerCase(Locale.ROOT); + if (os.contains("linux")) return "linux"; + if (os.contains("mac") || os.contains("darwin")) return "darwin"; + if (os.contains("windows")) return "windows"; + throw new UnsupportedOperationException("Unsupported OS: " + os); + } + + private static String currentArch() { + String arch = System.getProperty("os.arch", "").toLowerCase(Locale.ROOT); + if (arch.equals("amd64") || arch.equals("x86_64")) return "x86_64"; + if (arch.equals("aarch64") || arch.equals("arm64")) return "aarch64"; + throw new UnsupportedOperationException("Unsupported arch: " + arch); + } +} diff --git a/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister new file mode 100644 index 0000000..3e612e0 --- /dev/null +++ b/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ 
-0,0 +1 @@ +io.datafusion.spark.DatafusionSource diff --git a/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala b/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala new file mode 100644 index 0000000..eac18b6 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import scala.jdk.CollectionConverters._ + +import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision, IntervalUnit} +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, Schema} +import org.apache.spark.sql.types._ + +/** + * Arrow Schema → Spark StructType converter. + * + * The reported Spark schema MUST be one whose runtime ArrowColumnVector accessor Spark can pick + * for the underlying Arrow vector. Spark 3.5's `ArrowColumnVector` supports the following + * accessors: Boolean, Byte, Short, Int, Long, Float, Double, Decimal, Date, Timestamp, + * TimestampNTZ, Duration (DayTimeInterval), String, LargeString, Binary, Array, Map, Struct, + * Null. 
No unsigned-int or Time accessor exists; we surface a clear error at schema discovery + * for those — the alternative is silent corruption. + * + * The widening cdylib (connector-core/native/) inserts a `WideningTableProvider` upstream of the + * Spark reader that casts unsupported types kernel-side (UInt*→signed wider, Float16→Float32, + * non-µs Timestamp→µs Timestamp, Time→Int) so Spark only ever sees compatible Arrow types. + */ +object ArrowToSparkSchema { + + def toSparkSchema(schema: Schema): StructType = + StructType(schema.getFields.asScala.toSeq.map(toSparkField)) + + private def toSparkField(f: Field): StructField = { + val dt = + Option(f.getDictionary) match { + case Some(_) => + unsupported(f, "dictionary-encoded fields (need dictionary value schema in JNI)") + case None => toSparkType(f) + } + StructField(f.getName, dt, f.isNullable) + } + + private def toSparkType(f: Field): DataType = f.getType match { + case _: ArrowType.Bool => BooleanType + + case t: ArrowType.Int => + (t.getBitWidth, t.getIsSigned) match { + case (8, true) => ByteType + case (16, true) => ShortType + case (32, true) => IntegerType + case (64, true) => LongType + case (bits, false) => + unsupported( + f, + s"unsigned integer UInt$bits (Spark ArrowColumnVector has no unsigned accessor; " + + "widening cdylib casts these before Spark sees them — this branch indicates the " + + "WideningTableProvider was bypassed)" + ) + case (bits, signed) => unsupported(f, s"Int(bits=$bits, signed=$signed)") + } + + case t: ArrowType.FloatingPoint => + t.getPrecision match { + case FloatingPointPrecision.HALF => + unsupported(f, "Float16 (widening cdylib must cast to Float32 before Spark)") + case FloatingPointPrecision.SINGLE => FloatType + case FloatingPointPrecision.DOUBLE => DoubleType + case other => unsupported(f, s"FloatingPoint($other)") + } + + case _: ArrowType.Utf8 => StringType + case _: ArrowType.LargeUtf8 => StringType + case _: ArrowType.Binary => BinaryType + case _: 
ArrowType.LargeBinary => BinaryType + case _: ArrowType.FixedSizeBinary => BinaryType + + case d: ArrowType.Date => + d.getUnit match { + case DateUnit.DAY | DateUnit.MILLISECOND => DateType + case other => unsupported(f, s"Date($other)") + } + + case t: ArrowType.Timestamp => + val _unused = t.getUnit + if (t.getTimezone == null) TimestampNTZType else TimestampType + + case ti: ArrowType.Time => + unsupported( + f, + s"Time(${ti.getUnit}, ${ti.getBitWidth}-bit) — Spark has no time-of-day type" + ) + + case d: ArrowType.Decimal => DecimalType(d.getPrecision, d.getScale) + + case _: ArrowType.Null => NullType + + case _: ArrowType.Duration => DayTimeIntervalType() + + case iv: ArrowType.Interval => + iv.getUnit match { + case IntervalUnit.YEAR_MONTH => YearMonthIntervalType() + case IntervalUnit.DAY_TIME => DayTimeIntervalType() + case IntervalUnit.MONTH_DAY_NANO => + unsupported(f, "Interval(MONTH_DAY_NANO) — no clean Spark equivalent") + } + + case _: ArrowType.Struct => + StructType(f.getChildren.asScala.toSeq.map(toSparkField)) + + case _: ArrowType.List => + val child = f.getChildren.get(0) + ArrayType(toSparkType(child), containsNull = child.isNullable) + case _: ArrowType.LargeList => + val child = f.getChildren.get(0) + ArrayType(toSparkType(child), containsNull = child.isNullable) + case _: ArrowType.FixedSizeList => + val child = f.getChildren.get(0) + ArrayType(toSparkType(child), containsNull = child.isNullable) + + case _: ArrowType.Map => + val entries = f.getChildren.get(0) + val keyValue = entries.getChildren + val keyField = keyValue.get(0) + val valueField = keyValue.get(1) + MapType(toSparkType(keyField), toSparkType(valueField), valueField.isNullable) + + case _: ArrowType.Union => + unsupported(f, "Union (Spark has no equivalent)") + + case other => unsupported(f, s"$other") + } + + private def unsupported(f: Field, detail: String): Nothing = + throw new UnsupportedOperationException( + s"Column '${f.getName}': $detail" + ) +} diff --git 
a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala new file mode 100644 index 0000000..466651a --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory} + +/** + * Spark `Batch` for a DataFusion-backed scan. 
Owns: + * - partition planning (driver-side: `factory.listPartitions` enumerates partition ids → one + * task per id) + * - per-task reader factory ([[DatafusionPartitionReaderFactory]]) + */ +class DatafusionBatch(val scan: DatafusionScan) extends Batch { + + override def planInputPartitions(): Array[InputPartition] = { + val factory = instantiateFactory(scan.factoryFqcn) + val partitionIds: Array[String] = factory.listPartitions(scan.optionsProtoBytes) + + if (partitionIds == null || partitionIds.isEmpty) { + throw new IllegalStateException( + s"FfiProviderFactory '${scan.factoryFqcn}' returned no partitions to scan" + ) + } + + val projection = scan.prunedSchema.fieldNames + val filterBytes: Array[Array[Byte]] = scan.pushedPredicateBytes + + partitionIds.iterator.map { id => + DatafusionInputPartition( + factoryFqcn = scan.factoryFqcn, + optionsProtoBytes = scan.optionsProtoBytes, + projectionColumnNames = projection, + filterProtoBytes = filterBytes, + partitionId = id + ).asInstanceOf[InputPartition] + }.toArray + } + + override def createReaderFactory(): PartitionReaderFactory = + new DatafusionPartitionReaderFactory(scan.prunedSchema) + + private def instantiateFactory(fqcn: String): FfiProviderFactory = { + val cls = Class.forName(fqcn) + cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala new file mode 100644 index 0000000..e777f99 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.FieldVector +import org.apache.arrow.vector.ipc.ArrowReader +import org.apache.datafusion.{DataFrame, SessionContext} +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} + +/** + * Per-task columnar reader. Lifecycle: + * + * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg). + * 2. `createProvider(optionsProtoBytes)` — bridge builds an `Arc`, wraps it + * in an `FFI_TableProvider`, returns the raw pointer. + * 3. Hand that pointer to connector-core's widening cdylib via `FfiHelperNative.wrapWithWidening`. + * The cdylib wraps the inner provider in a `WideningTableProvider` (kernel-level + * `arrow::compute::cast` for Spark-incompatible Arrow types) and re-FFIs it. + * 4. Register the widened pointer on a fresh `SessionContext` via `registerFfiTable`. + * 5. Build a `SELECT projection FROM ` DataFrame; apply pushed filters via + * `DataFrame.filterFromProto`. + * 6. `executeStream` returns an `ArrowReader`; each `loadNextBatch()` yields a + * `VectorSchemaRoot` we wrap as a `ColumnarBatch` of `NonClosingArrowColumnVector`s. 
+ */ +class DatafusionColumnarPartitionReader( + partition: DatafusionInputPartition, + readSchema: StructType +) extends PartitionReader[ColumnarBatch] { + + private val TableName = "df_spark_partition" + + private val allocator = new RootAllocator(Long.MaxValue) + private val ctx: SessionContext = new SessionContext() + + private val factory: FfiProviderFactory = instantiateFactory(partition.factoryFqcn) + + private val df: DataFrame = { + val rawPtr = factory.createProvider(partition.optionsProtoBytes) + val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) + ctx.registerFfiTable(TableName, widenedPtr) + var d = ctx.sql(buildSql()) + var i = 0 + while (i < partition.filterProtoBytes.length) { + d = d.filterFromProto(partition.filterProtoBytes(i)) + i += 1 + } + d + } + private val reader: ArrowReader = df.executeStream(allocator) + + private var currentBatch: ColumnarBatch = _ + + override def next(): Boolean = { + if (currentBatch != null) { + currentBatch = null + } + if (!reader.loadNextBatch()) return false + val root = reader.getVectorSchemaRoot + val vectors: java.util.List[FieldVector] = root.getFieldVectors + val cols = new Array[ColumnVector](vectors.size()) + var i = 0 + while (i < vectors.size()) { + cols(i) = new NonClosingArrowColumnVector(vectors.get(i)) + i += 1 + } + val batch = new ColumnarBatch(cols) + batch.setNumRows(root.getRowCount) + currentBatch = batch + true + } + + override def get(): ColumnarBatch = currentBatch + + override def close(): Unit = { + var first: Throwable = null + def safe(f: => Unit): Unit = + try f + catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } + safe(reader.close()) + safe(ctx.close()) + safe(allocator.close()) + if (first != null) throw first + } + + private def buildSql(): String = { + val cols = + if (partition.projectionColumnNames.isEmpty) "*" + else + partition.projectionColumnNames + .map(c => "\"" + c.replace("\"", "\"\"") + "\"") + .mkString(", ") + s"""SELECT 
$cols FROM "$TableName"""" + } + + private def instantiateFactory(fqcn: String): FfiProviderFactory = { + val cls = Class.forName(fqcn) + cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + } + +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala new file mode 100644 index 0000000..9d66d2b --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.spark.sql.connector.read.InputPartition + +/** + * Per-task payload shipped from driver to executor via Java serialization. + * + * - `factoryFqcn`: fully-qualified class name of the bridge's `FfiProviderFactory`. The + * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes)`. + * - `optionsProtoBytes`: bridge-specific connection options, encoded by the bridge. Opaque to + * connector-core. + * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). 
+ * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is + * applied via `DataFrame.filterFromProto`. + * - `partitionId`: stable identifier (e.g. Rerun segment id) — for `preferredLocations` / debug. + */ +final case class DatafusionInputPartition( + factoryFqcn: String, + optionsProtoBytes: Array[Byte], + projectionColumnNames: Array[String], + filterProtoBytes: Array[Array[Byte]], + partitionId: String +) extends InputPartition diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala new file mode 100644 index 0000000..2a08fdb --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Per-task `PartitionReader` factory. 
Columnar-only: row-based reads would force the connector + * to convert Arrow → `InternalRow` per row, defeating the zero-copy path that is the whole + * reason we are in-process. + */ +class DatafusionPartitionReaderFactory(val readSchema: StructType) extends PartitionReaderFactory { + + override def supportColumnarReads(partition: InputPartition): Boolean = true + + override def createReader(partition: InputPartition): PartitionReader[InternalRow] = + throw new UnsupportedOperationException( + "DatafusionPartitionReaderFactory: row-based read not supported; consumers must opt into columnar" + ) + + override def createColumnarReader(partition: InputPartition): PartitionReader[ColumnarBatch] = { + val p = partition.asInstanceOf[DatafusionInputPartition] + new DatafusionColumnarPartitionReader(p, readSchema) + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala new file mode 100644 index 0000000..90f3cad --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import org.apache.spark.sql.connector.expressions.filter.Predicate +import org.apache.spark.sql.connector.read.{Batch, Scan} +import org.apache.spark.sql.types.StructType + +/** + * Read plan for a DataFusion-backed scan. Holds pruning state, the pushed predicates (for + * `description()` / `explain(True)`), and the corresponding `LogicalExprNode` proto byte arrays + * the executor applies via `DataFrame.filterFromProto`. + */ +class DatafusionScan( + val factoryFqcn: String, + val optionsProtoBytes: Array[Byte], + val fullSchema: StructType, + val prunedSchema: StructType, + val pushedPredicates: Array[Predicate], + val pushedPredicateBytes: Array[Array[Byte]] +) extends Scan { + + override def readSchema(): StructType = prunedSchema + + override def description(): String = + s"DatafusionScan(factory=$factoryFqcn, projection=${prunedSchema.fieldNames.mkString(",")}," + + s" pushedPredicates=${pushedPredicates.length})" + + override def toBatch: Batch = new DatafusionBatch(this) +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala new file mode 100644 index 0000000..63ef8ce --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.spark.sql.connector.expressions.filter.Predicate +import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownRequiredColumns, SupportsPushDownV2Filters} +import org.apache.spark.sql.types.StructType + +/** + * ScanBuilder with V2 Predicate pushdown + column pruning. Every translatable predicate is + * marked Exact and dropped from Spark's post-scan Filter; the rest stay residual. + * + * Pushdown discipline: over-claiming Exact = wrong results, under-claiming = full scans. The + * translator (see [[SparkPredicateTranslator]]) only emits proto for predicates it can encode + * losslessly — anything else returns `None` and lands in residuals. 
+ */ +class DatafusionScanBuilder( + factoryFqcn: String, + optionsProtoBytes: Array[Byte], + fullSchema: StructType +) extends ScanBuilder + with SupportsPushDownV2Filters + with SupportsPushDownRequiredColumns { + + private var pushed: Array[Predicate] = Array.empty + private var pushedBytes: Array[Array[Byte]] = Array.empty + private var pruned: StructType = fullSchema + + override def pushPredicates(predicates: Array[Predicate]): Array[Predicate] = { + val pushedBuf = scala.collection.mutable.ArrayBuffer[Predicate]() + val bytesBuf = scala.collection.mutable.ArrayBuffer[Array[Byte]]() + val residual = scala.collection.mutable.ArrayBuffer[Predicate]() + + var i = 0 + while (i < predicates.length) { + val p = predicates(i) + SparkPredicateTranslator.translate(p) match { + case Some(node) => + pushedBuf += p + bytesBuf += node.toByteArray + case None => + residual += p + } + i += 1 + } + pushed = pushedBuf.toArray + pushedBytes = bytesBuf.toArray + residual.toArray + } + + override def pushedPredicates(): Array[Predicate] = pushed + + override def pruneColumns(requiredSchema: StructType): Unit = { + pruned = requiredSchema + } + + override def build(): Scan = + new DatafusionScan(factoryFqcn, optionsProtoBytes, fullSchema, pruned, pushed, pushedBytes) +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala new file mode 100644 index 0000000..629060e --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import java.util + +import org.apache.datafusion.SessionContext +import org.apache.spark.sql.connector.catalog.{Table, TableProvider} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +/** + * Generic Spark DataSource V2 entry point. Concrete bridges either: + * - Subclass and override [[shortName]] + [[factoryFqcn]] (the rerun-connector pattern), or + * - Use this class directly with `option("df.factory", "fully.qualified.FactoryClass")`. + * + * Schema discovery happens driver-side via a transient SessionContext: the factory's + * `FFI_TableProvider` is built, wrapped with the widening cdylib, registered on the context, and + * its Arrow schema read via `tableSchema(name)`. The same `optionsProtoBytes` (and the factory + * FQCN) is then carried verbatim through `DatafusionInputPartition`, so each executor task + * repeats the same factory → wrapWithWidening → registerFfiTable pipeline locally. + */ +class DatafusionSource extends TableProvider with DataSourceRegister { + + override def shortName(): String = "datafusion" + + /** Spark option key carrying the FfiProviderFactory FQCN when no override is provided. */ + protected val FactoryOptionKey: String = "df.factory" + + /** + * Resolve the bridge factory class name from the Spark options. Subclasses (e.g. 
+ * `RerunDataSource`) override to return a hard-coded FQCN so users don't need to set + * `df.factory` themselves. + */ + protected def factoryFqcn(options: CaseInsensitiveStringMap): String = { + val v = options.get(FactoryOptionKey) + if (v == null || v.isEmpty) + throw new IllegalArgumentException( + s"DatafusionSource: option '$FactoryOptionKey' is required when no subclass override is set" + ) + v + } + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + val fqcn = factoryFqcn(options) + val factory = instantiateFactory(fqcn) + val optionsBytes = factory.encodeOptions(options.asCaseSensitiveMap()) + val arrowSchema = { + val ctx = new SessionContext() + try { + val rawPtr = factory.createProvider(optionsBytes) + val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) + ctx.registerFfiTable("__df_schema_probe__", widenedPtr) + ctx.tableSchema("__df_schema_probe__") + } finally ctx.close() + } + ArrowToSparkSchema.toSparkSchema(arrowSchema) + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String] + ): Table = { + val options = new CaseInsensitiveStringMap(properties) + val fqcn = factoryFqcn(options) + val factory = instantiateFactory(fqcn) + val optionsBytes = factory.encodeOptions(options.asCaseSensitiveMap()) + new DatafusionTable(fqcn, optionsBytes, schema) + } + + override def supportsExternalMetadata(): Boolean = false + + private def instantiateFactory(fqcn: String): FfiProviderFactory = { + val cls = Class.forName(fqcn) + cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala new file mode 100644 index 0000000..31a55d2 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import java.util + +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.read.ScanBuilder +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +/** + * Read-only DataFusion-backed table. Capabilities advertise only batch read. 
+ */ +class DatafusionTable( + val factoryFqcn: String, + val optionsProtoBytes: Array[Byte], + val sparkSchema: StructType +) extends Table + with SupportsRead { + + override def name(): String = s"datafusion.${factoryFqcn.split('.').last}" + + override def schema(): StructType = sparkSchema + + override def capabilities(): util.Set[TableCapability] = { + val caps = new util.HashSet[TableCapability]() + caps.add(TableCapability.BATCH_READ) + caps + } + + override def newScanBuilder(scanOpts: CaseInsensitiveStringMap): ScanBuilder = + new DatafusionScanBuilder(factoryFqcn, optionsProtoBytes, sparkSchema) +} diff --git a/spark/src/main/scala/io/datafusion/spark/NonClosingArrowColumnVector.scala b/spark/src/main/scala/io/datafusion/spark/NonClosingArrowColumnVector.scala new file mode 100644 index 0000000..4fa74bd --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/NonClosingArrowColumnVector.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.arrow.vector.FieldVector +import org.apache.spark.sql.vectorized.ArrowColumnVector + +/** + * `ArrowColumnVector` whose `close()` is a no-op. 
The `ArrowReader`'s `VectorSchemaRoot` owns + * the underlying `ValueVector` lifecycles across `loadNextBatch()` calls; closing them per Spark + * batch would break the next read. Lifecycle is centralised in + * `DatafusionColumnarPartitionReader.close()`. + */ +final class NonClosingArrowColumnVector(vec: FieldVector) extends ArrowColumnVector(vec) { + override def close(): Unit = { /* intentional no-op */ } +} diff --git a/spark/src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala b/spark/src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala new file mode 100644 index 0000000..3092914 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import datafusion_common.DatafusionCommon.{Column, ScalarValue} +import org.apache.datafusion.protobuf.{ + BinaryExprNode, + InListNode, + IsNotNull, + IsNull, + LikeNode, + LogicalExprNode, + Not => NotNode +} +import org.apache.spark.sql.connector.expressions.{Expression, Literal, NamedReference} +import org.apache.spark.sql.connector.expressions.filter.Predicate + +/** + * Translate Spark V2 `Predicate` → DataFusion `LogicalExprNode` proto. Only emits expressions + * that the producer can apply EXACTLY — anything else returns `None` and the caller marks the + * predicate as residual so Spark re-applies it above the scan. + */ +object SparkPredicateTranslator { + + def translate(p: Predicate): Option[LogicalExprNode] = p.name() match { + case "=" => binary(p, "Eq") + case "<>" => binary(p, "NotEq") + case "<" => binary(p, "Lt") + case "<=" => binary(p, "LtEq") + case ">" => binary(p, "Gt") + case ">=" => binary(p, "GtEq") + case "IS_NULL" => unary(p, "IsNull") + case "IS_NOT_NULL" => unary(p, "IsNotNull") + case "AND" => combine(p, "And") + case "OR" => combine(p, "Or") + case "NOT" => translateNot(p) + case "IN" => translateIn(p) + case "STARTS_WITH" => like(p, prefix = false, suffix = true) + case "ENDS_WITH" => like(p, prefix = true, suffix = false) + case "CONTAINS" => like(p, prefix = true, suffix = true) + case _ => None + } + + private def binary(p: Predicate, op: String): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length != 2) return None + val left = expr(cs(0)) + val right = expr(cs(1)) + if (left.isEmpty || right.isEmpty) return None + Some( + LogicalExprNode + .newBuilder() + .setBinaryExpr( + BinaryExprNode + .newBuilder() + .addOperands(left.get) + .addOperands(right.get) + .setOp(op) + .build() + ) + .build() + ) + } + + private def unary(p: Predicate, op: String): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length != 1) return None + val inner = expr(cs(0)) + if 
(inner.isEmpty) return None + val builder = LogicalExprNode.newBuilder() + op match { + case "IsNull" => builder.setIsNullExpr(IsNull.newBuilder().setExpr(inner.get).build()) + case "IsNotNull" => + builder.setIsNotNullExpr(IsNotNull.newBuilder().setExpr(inner.get).build()) + case _ => return None + } + Some(builder.build()) + } + + private def combine(p: Predicate, op: String): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length != 2) return None + val (l, r) = (cs(0), cs(1)) + if (!l.isInstanceOf[Predicate] || !r.isInstanceOf[Predicate]) return None + val ln = translate(l.asInstanceOf[Predicate]) + val rn = translate(r.asInstanceOf[Predicate]) + if (ln.isEmpty || rn.isEmpty) return None + Some( + LogicalExprNode + .newBuilder() + .setBinaryExpr( + BinaryExprNode + .newBuilder() + .addOperands(ln.get) + .addOperands(rn.get) + .setOp(op) + .build() + ) + .build() + ) + } + + private def translateNot(p: Predicate): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length != 1 || !cs(0).isInstanceOf[Predicate]) return None + val inner = translate(cs(0).asInstanceOf[Predicate]) + if (inner.isEmpty) return None + Some(LogicalExprNode.newBuilder().setNotExpr(NotNode.newBuilder().setExpr(inner.get).build()).build()) + } + + private def translateIn(p: Predicate): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length < 2) return None + val target = expr(cs(0)) + if (target.isEmpty) return None + val values = new java.util.ArrayList[LogicalExprNode]() + var i = 1 + while (i < cs.length) { + val v = expr(cs(i)) + if (v.isEmpty) return None + values.add(v.get) + i += 1 + } + val node = InListNode + .newBuilder() + .setExpr(target.get) + .addAllList(values) + .setNegated(false) + .build() + Some(LogicalExprNode.newBuilder().setInList(node).build()) + } + + private def like(p: Predicate, prefix: Boolean, suffix: Boolean): Option[LogicalExprNode] = { + val cs = p.children() + if (cs.length != 2) return None + val target = expr(cs(0)) + 
val pat = cs(1) match { + case lit: Literal[_] => + val raw = lit.value() match { + case s: String => Some(s) + case u: org.apache.spark.unsafe.types.UTF8String => Some(u.toString) + case _ => None + } + raw.map { r => + val escaped = r.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + (if (prefix) "%" else "") + escaped + (if (suffix) "%" else "") + } + case _ => None + } + if (target.isEmpty || pat.isEmpty) return None + val patternExpr = stringLiteral(pat.get) + val like = LikeNode + .newBuilder() + .setExpr(target.get) + .setPattern(patternExpr) + .setNegated(false) + .setEscapeChar("\\") + .build() + Some(LogicalExprNode.newBuilder().setLike(like).build()) + } + /** Translate a leaf operand: a single-part column reference or a scalar literal; any other expression yields `None`, which makes the enclosing predicate residual. */ + private def expr(e: Expression): Option[LogicalExprNode] = e match { + case nr: NamedReference => + val parts = nr.fieldNames() + if (parts.length != 1) None + else + Some( + LogicalExprNode + .newBuilder() + .setColumn(Column.newBuilder().setName(parts(0)).build()) + .build() + ) + case lit: Literal[_] => literal(lit) + case _ => None + } + /** Encode a literal as a DataFusion scalar only when its Spark `dataType()` is the plain type matching the boxed value, else `None`. V2 literals carry Spark's internal row representation (a DATE arrives as an Integer, a TIMESTAMP as a Long), so matching on the JVM class alone would push those as Int32/Int64. */ + private def literal(lit: Literal[_]): Option[LogicalExprNode] = { + val sv = ScalarValue.newBuilder() + val ok: Boolean = (lit.value(): Any, lit.dataType()) match { + case (b: java.lang.Boolean, org.apache.spark.sql.types.BooleanType) => sv.setBoolValue(b.booleanValue()); true + case (b: java.lang.Byte, org.apache.spark.sql.types.ByteType) => sv.setInt8Value(b.intValue()); true + case (s: java.lang.Short, org.apache.spark.sql.types.ShortType) => sv.setInt16Value(s.intValue()); true + case (i: java.lang.Integer, org.apache.spark.sql.types.IntegerType) => sv.setInt32Value(i.intValue()); true + case (l: java.lang.Long, org.apache.spark.sql.types.LongType) => sv.setInt64Value(l.longValue()); true + case (f: java.lang.Float, org.apache.spark.sql.types.FloatType) => sv.setFloat32Value(f.floatValue()); true + case (d: java.lang.Double, org.apache.spark.sql.types.DoubleType) => sv.setFloat64Value(d.doubleValue()); true + case (s: String, org.apache.spark.sql.types.StringType) => sv.setUtf8Value(s); true + case (u: org.apache.spark.unsafe.types.UTF8String, org.apache.spark.sql.types.StringType) => sv.setUtf8Value(u.toString); true + case _ => false + } + if (!ok) None + else Some(LogicalExprNode.newBuilder().setLiteral(sv.build()).build()) + } + + private def stringLiteral(s: String): LogicalExprNode = + 
LogicalExprNode.newBuilder().setLiteral(ScalarValue.newBuilder().setUtf8Value(s).build()).build() +} diff --git a/spark/src/test/scala/io/datafusion/spark/ArrowToSparkSchemaTest.scala b/spark/src/test/scala/io/datafusion/spark/ArrowToSparkSchemaTest.scala new file mode 100644 index 0000000..2b59601 --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/ArrowToSparkSchemaTest.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import java.util.Collections + +import org.apache.arrow.vector.types.{DateUnit, FloatingPointPrecision, IntervalUnit, TimeUnit} +import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema} +import org.apache.spark.sql.types._ +import org.scalatest.funsuite.AnyFunSuite + +class ArrowToSparkSchemaTest extends AnyFunSuite { + + private def primField(name: String, t: ArrowType, nullable: Boolean = true): Field = + new Field(name, new FieldType(nullable, t, /*dict=*/ null), Collections.emptyList()) + + test("signed ints map to matching Spark int types") { + val arrow = new Schema( + java.util.Arrays.asList( + primField("i8", new ArrowType.Int(8, true)), + primField("i16", new ArrowType.Int(16, true)), + primField("i32", new ArrowType.Int(32, true)), + primField("i64", new ArrowType.Int(64, true)) + ) + ) + val s = ArrowToSparkSchema.toSparkSchema(arrow) + assert(s.fields(0).dataType == ByteType) + assert(s.fields(1).dataType == ShortType) + assert(s.fields(2).dataType == IntegerType) + assert(s.fields(3).dataType == LongType) + } + + test("unsigned ints are rejected with a clear error") { + val arrow = new Schema( + java.util.Arrays.asList(primField("u32", new ArrowType.Int(32, false))) + ) + val ex = intercept[UnsupportedOperationException](ArrowToSparkSchema.toSparkSchema(arrow)) + assert(ex.getMessage.contains("u32")) + assert(ex.getMessage.toLowerCase.contains("unsigned")) + } + + test("timestamps split on timezone presence") { + val withTz = primField("t_utc", new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC")) + val noTz = primField("t_local", new ArrowType.Timestamp(TimeUnit.MICROSECOND, null)) + val s = ArrowToSparkSchema.toSparkSchema( + new Schema(java.util.Arrays.asList(withTz, noTz)) + ) + assert(s.fields(0).dataType == TimestampType) + assert(s.fields(1).dataType == TimestampNTZType) + } + + test("decimal preserves precision and scale") { + val s = ArrowToSparkSchema.toSparkSchema( + new 
Schema(java.util.Arrays.asList(primField("d", new ArrowType.Decimal(18, 4, 128)))) + ) + assert(s.fields(0).dataType == DecimalType(18, 4)) + } + + test("Time and Float16 are rejected (no Spark accessor)") { + intercept[UnsupportedOperationException] { + ArrowToSparkSchema.toSparkSchema( + new Schema(java.util.Arrays.asList(primField("t", new ArrowType.Time(TimeUnit.MICROSECOND, 64)))) + ) + } + intercept[UnsupportedOperationException] { + ArrowToSparkSchema.toSparkSchema( + new Schema(java.util.Arrays.asList(primField("h", new ArrowType.FloatingPoint(FloatingPointPrecision.HALF)))) + ) + } + } + + test("list element nullability propagates") { + val child = + new Field( + "el", + new FieldType(/*nullable=*/ true, new ArrowType.Int(32, true), null), + Collections.emptyList() + ) + val listField = new Field( + "xs", + new FieldType(true, new ArrowType.List(), null), + java.util.Arrays.asList(child) + ) + val s = ArrowToSparkSchema.toSparkSchema( + new Schema(java.util.Arrays.asList(listField)) + ) + assert(s.fields(0).dataType == ArrayType(IntegerType, containsNull = true)) + } +} diff --git a/spark/src/test/scala/io/datafusion/spark/SparkPredicateTranslatorTest.scala b/spark/src/test/scala/io/datafusion/spark/SparkPredicateTranslatorTest.scala new file mode 100644 index 0000000..b7faac1 --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/SparkPredicateTranslatorTest.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.datafusion.protobuf.LogicalExprNode +import org.apache.spark.sql.connector.expressions.{Expression, Expressions, NamedReference} +import org.apache.spark.sql.connector.expressions.filter.Predicate +import org.scalatest.funsuite.AnyFunSuite + +class SparkPredicateTranslatorTest extends AnyFunSuite { + + private def col(name: String): NamedReference = Expressions.column(name) + private def litInt(v: Int): Expression = Expressions.literal(Int.box(v)) + private def litLong(v: Long): Expression = Expressions.literal(Long.box(v)) + private def litStr(v: String): Expression = + Expressions.literal(org.apache.spark.unsafe.types.UTF8String.fromString(v)) + + test("LessThan(timeline, 1_000_000) translates to a non-empty proto") { + val p = new Predicate("<", Array[Expression](col("timeline"), litLong(1000000L))) + val node = SparkPredicateTranslator.translate(p).getOrElse(fail("expected Some")) + val bytes = node.toByteArray + assert(bytes.nonEmpty) + val parsed = LogicalExprNode.parseFrom(bytes) + assert(parsed.hasBinaryExpr) + assert(parsed.getBinaryExpr.getOp == "Lt") + } + + test("AND of two translatable predicates round-trips through binary op 'And'") { + val lt = new Predicate("<", Array[Expression](col("a"), litInt(10))) + val eq = new Predicate("=", Array[Expression](col("b"), litStr("x"))) + val and = new Predicate("AND", Array[Expression](lt, eq)) + val node = SparkPredicateTranslator.translate(and).getOrElse(fail("expected Some")) + val parsed = LogicalExprNode.parseFrom(node.toByteArray) + 
assert(parsed.hasBinaryExpr) + assert(parsed.getBinaryExpr.getOp == "And") + } + + test("AND becomes residual when an operand is untranslatable") { + val nse = new Predicate("<=>", Array[Expression](col("a"), litInt(1))) + val eq = new Predicate("=", Array[Expression](col("b"), litInt(2))) + val and = new Predicate("AND", Array[Expression](nse, eq)) + assert(SparkPredicateTranslator.translate(and).isEmpty) + } + + test("IS_NULL and IS_NOT_NULL emit the dedicated proto variants") { + val isNull = new Predicate("IS_NULL", Array[Expression](col("x"))) + val isNotNull = new Predicate("IS_NOT_NULL", Array[Expression](col("x"))) + val n1 = SparkPredicateTranslator.translate(isNull).getOrElse(fail()).toByteArray + val n2 = SparkPredicateTranslator.translate(isNotNull).getOrElse(fail()).toByteArray + val p1 = LogicalExprNode.parseFrom(n1) + val p2 = LogicalExprNode.parseFrom(n2) + assert(p1.hasIsNullExpr) + assert(p2.hasIsNotNullExpr) + } + + test("STARTS_WITH translates to a LIKE with a '%' suffix") { + val p = + new Predicate("STARTS_WITH", Array[Expression](col("name"), litStr("foo"))) + val node = SparkPredicateTranslator.translate(p).getOrElse(fail()) + val parsed = LogicalExprNode.parseFrom(node.toByteArray) + assert(parsed.hasLike) + val patStr = parsed.getLike.getPattern.getLiteral.getUtf8Value + assert(patStr == "foo%") + } + + test("unknown predicate name returns None (becomes residual)") { + val p = new Predicate("UNKNOWN_OP", Array[Expression](col("x"), litInt(1))) + assert(SparkPredicateTranslator.translate(p).isEmpty) + } +} From 7ee7c9d4d12fe1a86779f814d7c79a4d770cdc3f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 16:24:04 +0200 Subject: [PATCH 04/22] refactor(build): consolidate Rust crates into a Cargo workspace The three Rust crates (`native`, `examples/native`, `spark/native`) now live under one root Cargo workspace with all dependencies declared in `[workspace.dependencies]`. 
Members reference shared deps via `{ workspace = true }`; per-crate flags (`optional`, extra `features`) stay at the use site. Only one `Cargo.lock` to maintain. Cargo writes to `rust-target/` (overridden via `.cargo/config.toml`) so `mvn clean` at the repo root does not nuke the Rust build cache. Maven antrun copies, Makefile, GitHub Actions caches, examples README, and the `FfiTableProviderExampleNative` lookup paths all repoint there. Co-Authored-By: Claude Opus 4.7 (1M context) --- .cargo/config.toml | 21 + .github/workflows/build.yml | 4 +- .github/workflows/lint.yml | 8 +- .gitignore | 1 + native/Cargo.lock => Cargo.lock | 201 +- Cargo.toml | 49 + Makefile | 10 +- core/pom.xml | 4 +- .../org/apache/datafusion/SessionContext.java | 11 +- .../SessionContextRuntimeStatsTest.java | 2 +- .../SessionContextSubstraitTest.java | 2 +- docs/source/contributor-guide/development.md | 18 +- .../updating-datafusion-version.md | 10 +- examples/README.md | 15 +- examples/native/Cargo.lock | 3653 ---------------- examples/native/Cargo.toml | 10 +- .../examples/FfiTableProviderExample.java | 2 +- .../FfiTableProviderExampleNative.java | 22 +- native/Cargo.toml | 36 +- native/src/runtime_metrics.rs | 2 +- pom.xml | 7 +- spark/native/Cargo.lock | 3655 ----------------- spark/native/Cargo.toml | 14 +- spark/pom.xml | 4 +- 24 files changed, 276 insertions(+), 7485 deletions(-) create mode 100644 .cargo/config.toml rename native/Cargo.lock => Cargo.lock (96%) create mode 100644 Cargo.toml delete mode 100644 examples/native/Cargo.lock delete mode 100644 spark/native/Cargo.lock diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..d7e0ee2 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Keep Cargo's workspace output out of `target/` so `mvn clean` (which deletes +# the root `target/`) does not nuke the Rust build cache. +[build] +target-dir = "rust-target" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c5db936..da8e65a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -83,8 +83,8 @@ jobs: path: | ~/.cargo/registry ~/.cargo/git - native/target - key: ${{ runner.os }}-cargo-${{ hashFiles('native/Cargo.lock') }} + rust-target + key: ${{ runner.os }}-cargo-${{ hashFiles('Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - name: Build native and run tests diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4cf628f..952bf34 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,7 +54,7 @@ jobs: run: ./mvnw -q spotless:check - name: Check Rust formatting - run: cd native && cargo fmt --all -- --check + run: cargo fmt --all -- --check clippy: name: Clippy @@ -81,9 +81,9 @@ jobs: path: | ~/.cargo/registry ~/.cargo/git - native/target - key: ${{ runner.os }}-clippy-${{ hashFiles('native/Cargo.lock') }} + rust-target + key: ${{ runner.os }}-clippy-${{ hashFiles('Cargo.lock') }} restore-keys: ${{ runner.os }}-clippy- - name: Run clippy - run: cd native && cargo clippy --all-targets -- -D warnings + run: cargo clippy --workspace --all-targets -- -D warnings diff --git 
a/.gitignore b/.gitignore index 719a2a4..25c9216 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target/ +rust-target/ *.class .idea/ .vscode/ diff --git a/native/Cargo.lock b/Cargo.lock similarity index 96% rename from native/Cargo.lock rename to Cargo.lock index 93b2d0e..a6a4204 100644 --- a/native/Cargo.lock +++ b/Cargo.lock @@ -146,9 +146,9 @@ dependencies = [ [[package]] name = "ar_archive_writer" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" dependencies = [ "object", ] @@ -462,9 +462,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" [[package]] name = "base64" @@ -488,9 +488,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "blake2" @@ -526,9 +526,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.9.1" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +checksum = "b2f04f6fef12d70d42a77b1433c9e0f065238479a6cefc4f5bab105e9873a3c3" dependencies = [ "bon-macros", "rustversion", @@ -536,9 +536,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.9.1" +version = "3.9.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +checksum = "7d0bd4c2f75335ad98052a37efb54f428b492f64340257143b3429c8a508fa7b" dependencies = [ "darling", "ident_case", @@ -551,9 +551,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.2" +version = "8.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -562,9 +562,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.0" +version = "5.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -572,9 +572,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" [[package]] name = "byteorder" @@ -599,9 +599,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.62" +version = "1.2.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" dependencies = [ "find-msvc-tools", "jobserver", @@ -640,9 +640,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "num-traits", @@ -891,9 +891,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "6.1.0" +version = "6.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1431,6 +1431,17 @@ dependencies = [ "datafusion-physical-expr-common", ] +[[package]] +name = "datafusion-java-ffi-example" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "datafusion-ffi", + "jni", + "tokio", +] + [[package]] name = "datafusion-jni" version = "0.1.0" @@ -1660,6 +1671,19 @@ dependencies = [ "parking_lot", ] +[[package]] +name = "datafusion-spark-helper" +version = "0.1.0" +dependencies = [ + "arrow", + "async-trait", + "datafusion", + "datafusion-ffi", + "futures", + "jni", + "tokio", +] + [[package]] name = "datafusion-sql" version = "53.1.0" @@ -1712,9 +1736,9 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -1729,9 +1753,9 @@ checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "either" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" [[package]] name = "equivalent" @@ -2046,9 +2070,9 @@ checksum = 
"7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -2091,9 +2115,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.9.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +checksum = "55281c53a1894c864990125767da440a4e630446785086f52523b20033b74498" dependencies = [ "atomic-waker", "bytes", @@ -2383,13 +2407,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.98" +version = "0.3.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" +checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" dependencies = [ "cfg-if", "futures-util", - "once_cell", "wasm-bindgen", ] @@ -2458,9 +2481,9 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.3" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" +checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" [[package]] name = "libc" @@ -2527,9 +2550,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.29" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "lru-slab" @@ -2558,9 +2581,9 @@ dependencies = [ [[package]] 
name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" [[package]] name = "miniz_oxide" @@ -2574,9 +2597,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +checksum = "02bd0af71c67b473010cbbc60715ee815645a4dc942899111f494b4b737d6fda" dependencies = [ "libc", "wasi", @@ -2722,9 +2745,9 @@ dependencies = [ [[package]] name = "parquet" -version = "58.2.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d7efd3052f7d6ef601085559a246bc991e9a8cc77e02753737df6322ce35f1" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash", "arrow-array", @@ -2886,9 +2909,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -2896,9 +2919,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools", @@ -2915,9 +2938,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools", @@ -2928,9 +2951,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -3187,9 +3210,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.12.3" +version = "1.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" dependencies = [ "aho-corasick", "memchr", @@ -3216,9 +3239,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" [[package]] name = "regress" @@ -3339,9 +3362,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +checksum = "dab5152771c58876a2146916e53e35057e1a4dfa2b9df0f0305b07f611fdea4d" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -3522,9 +3545,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" 
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "itoa", "memchr", @@ -3583,9 +3606,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.3.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" [[package]] name = "simd-adler32" @@ -3625,9 +3648,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", "windows-sys 0.61.2", @@ -4054,9 +4077,9 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.20.0" +version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" [[package]] name = "typewit" @@ -4119,9 +4142,9 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.13.2" +version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" [[package]] name = "unicode-width" @@ -4167,9 +4190,9 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.23.1" +version = "1.23.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -4228,9 +4251,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" +checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" dependencies = [ "cfg-if", "once_cell", @@ -4241,9 +4264,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.71" +version = "0.4.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" dependencies = [ "js-sys", "wasm-bindgen", @@ -4251,9 +4274,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578" +checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4261,9 +4284,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.121" +version = "0.2.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2" +checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" dependencies = [ "bumpalo", "proc-macro2", @@ -4274,9 +4297,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.121" +version = "0.2.123" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441" +checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" dependencies = [ "unicode-ident", ] @@ -4330,9 +4353,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.98" +version = "0.3.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" dependencies = [ "js-sys", "wasm-bindgen", @@ -4762,9 +4785,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -4785,18 +4808,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", @@ -4805,9 +4828,9 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" dependencies = [ "zerofrom-derive", ] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d582098 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[workspace] +resolver = "2" +members = [ + "native", + "examples/native", + "spark/native", +] + +# Every dependency used by any workspace member is declared here so version +# bumps live in one place and the resolver picks a single version of each +# crate across the workspace. Members reference these via `{ workspace = true }` +# and add per-crate flags (optional, features, default-features) at the use +# site. +[workspace.dependencies] +arrow = { version = "58", features = ["ffi"] } +async-trait = "0.1" +datafusion = { version = "53.1.0" } +datafusion-ffi = "53.1.0" +datafusion-proto = "53.1.0" +datafusion-substrait = "53.1.0" +futures = "0.3" +jni = "0.21" +# Pinned to the major DataFusion 53.1 pulls in transitively (0.13.x) so we +# share the same `dyn ObjectStore` vtable and don't double-link. 
+object_store = { version = "0.13", default-features = false } +prost = "0.14" +prost-build = "0.14" +protoc-bin-vendored = "3" +tokio = { version = "1", features = ["rt-multi-thread"] } +# Optional, cfg-gated. See `native/Cargo.toml` for the build-flag dance. +tokio-metrics = "0.5" +url = "2" diff --git a/Makefile b/Makefile index 6d9b0ae..d6bcf2c 100644 --- a/Makefile +++ b/Makefile @@ -20,14 +20,14 @@ all: native jvm native: - cd native && cargo build + cargo build --workspace -# Build the native crate with the `runtime-metrics` Cargo feature enabled. +# Build the JNI crate with the `runtime-metrics` Cargo feature enabled. # Requires `--cfg tokio_unstable` because tokio-metrics gates its API there. # Default `make native` does not pull this in; callers who need # SessionContext.runtimeStats() pick this target explicitly. native-runtime-metrics: - cd native && RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics + RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics jvm: ./mvnw package -DskipTests @@ -39,10 +39,10 @@ test: native # `:check` form inline in .github/workflows/lint.yml. format: ./mvnw -q spotless:apply - cd native && cargo fmt --all + cargo fmt --all clean: - cd native && cargo clean + cargo clean ./mvnw clean tpch-data: diff --git a/core/pom.xml b/core/pom.xml index 5ddf107..1e25736 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -102,8 +102,8 @@ under the License. 
- + value="${maven.multiModuleProjectDirectory}/rust-target/${datafusion.native.profile}/${datafusion.lib.filename}"/> + diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java index 86140a0..ea56d80 100644 --- a/core/src/main/java/org/apache/datafusion/SessionContext.java +++ b/core/src/main/java/org/apache/datafusion/SessionContext.java @@ -113,10 +113,11 @@ public DataFrame fromProto(byte[] planBytes) { * other Substrait-emitting tool — and hand them to DataFusion without round-tripping through SQL. * *

Substrait support is gated behind the {@code substrait} Cargo feature on the native crate - * and is off by default. Rebuild the native crate with {@code cargo build - * --features substrait} (or {@code cargo build --features substrait,protoc} for hermetic builds - * that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native binary - * built without the feature, this method throws {@link RuntimeException} pointing at the flag. + * and is off by default. Rebuild the native crate with {@code cargo build -p + * datafusion-jni --features substrait} (or {@code ... --features substrait,protoc} for hermetic + * builds that vendor {@code protoc} via {@code cmake}) to enable it. If invoked against a native + * binary built without the feature, this method throws {@link RuntimeException} pointing at the + * flag. * * @throws IllegalArgumentException if {@code planBytes} is {@code null}. * @throws IllegalStateException if this context is closed. @@ -183,7 +184,7 @@ public MemoryUsage memoryUsage() { * Rebuild with: * *

{@code
-   * RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics
+   * RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics
    * }
* *

If invoked against a native binary built without the feature, this method throws {@link diff --git a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java index 120d179..d567275 100644 --- a/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java +++ b/core/src/test/java/org/apache/datafusion/SessionContextRuntimeStatsTest.java @@ -37,7 +37,7 @@ * #checkFeatureEnabled}. Run * *

{@code
- * (cd native && RUSTFLAGS="--cfg tokio_unstable" cargo build --features runtime-metrics)
+ * RUSTFLAGS="--cfg tokio_unstable" cargo build -p datafusion-jni --features runtime-metrics
  * }
* * before {@code ./mvnw test} to exercise this class. diff --git a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java index 34db3b5..a2cfb0a 100644 --- a/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java +++ b/core/src/test/java/org/apache/datafusion/SessionContextSubstraitTest.java @@ -50,7 +50,7 @@ * *

The {@code substrait} Cargo feature is off by default in {@code native/Cargo.toml}; if the * native crate was built without it, every test here is skipped (see {@link #checkFeatureEnabled}). - * Run {@code (cd native && cargo build --features substrait)} before {@code ./mvnw test} to + * Run {@code cargo build -p datafusion-jni --features substrait} before {@code ./mvnw test} to * exercise this class. */ class SessionContextSubstraitTest { diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index 984d77c..9eba9a5 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -42,7 +42,7 @@ This builds the native Rust crate and runs the JUnit tests. The steps can be run individually: ```sh -cd native && cargo build +cargo build --workspace ./mvnw test ``` @@ -74,14 +74,22 @@ disk space. The repository is a multi-module Maven build: -- `pom.xml` — parent POM declaring the `core` and `examples` modules and - shared plugin/dependency versions. +- `Cargo.toml` — Rust workspace root declaring the three crate members + (`native`, `examples/native`, `spark/native`) and `[workspace.dependencies]` + that pin shared versions in one place. Cargo writes artifacts to + `rust-target/` (overridden in `.cargo/config.toml`) so `mvn clean` at the + repo root does not nuke the Rust build cache. +- `pom.xml` — parent POM declaring the `core`, `spark`, and `examples` + modules and shared plugin/dependency versions. - `core/` — `datafusion-java` library module (Java sources, tests, and generated protobuf classes). +- `spark/` — `datafusion-java-spark` Spark DataSource V2 connector + (Scala + Java) and its `spark/native/` widening cdylib crate. - `examples/` — `datafusion-java-examples` module containing runnable examples that depend on the library; built alongside the library so they - cannot fall out of sync with the API. -- `native/` — Rust crate (JNI + Arrow C Data Interface). 
+ cannot fall out of sync with the API. Includes `examples/native/`, a + small FFI table-provider cdylib used by `FfiTableProviderExample`. +- `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface). - `proto/` — Protobuf definitions shared between Java and Rust. - `Makefile` — top-level build orchestration (`make test`, `make format`, `make tpch-data`). diff --git a/docs/source/contributor-guide/updating-datafusion-version.md b/docs/source/contributor-guide/updating-datafusion-version.md index 56d50dc..ef6cd10 100644 --- a/docs/source/contributor-guide/updating-datafusion-version.md +++ b/docs/source/contributor-guide/updating-datafusion-version.md @@ -21,7 +21,9 @@ under the License. Three things must move together when bumping DataFusion: -1. `native/Cargo.toml` — the `datafusion` crate dependency. +1. `Cargo.toml` (workspace root) — the `datafusion`, `datafusion-ffi`, + `datafusion-proto`, and `datafusion-substrait` entries in + `[workspace.dependencies]`. Members inherit from there. 2. `pom.xml` — the `` Maven property. **Must equal the Cargo version**; a mismatch means JVM-built protobuf plans won't deserialize on the native side. @@ -32,9 +34,9 @@ Three things must move together when bumping DataFusion: ## Recipe ```sh -# 1. Bump the Cargo dep -$EDITOR native/Cargo.toml # set datafusion = "" -(cd native && cargo update -p datafusion) +# 1. Bump the workspace dep +$EDITOR Cargo.toml # set datafusion = "" in [workspace.dependencies] +cargo update -p datafusion # 2. Bump the Maven property to match $EDITOR pom.xml # set diff --git a/examples/README.md b/examples/README.md index beb7763..d2c2e5b 100644 --- a/examples/README.md +++ b/examples/README.md @@ -37,21 +37,20 @@ add `-Dmaven.repo.local=/path/to/repo` to BOTH invocations.) ## Building the FFI example's cdylib The `FfiTableProviderExample` relies on a small Rust cdylib under -[`native/`](native/) — built independently from the main `datafusion-jni` -crate: +[`native/`](native/). 
It is a member of the repo-root Cargo workspace, so +build it by name from anywhere in the tree: ```bash -cd examples/native -cargo build --release +cargo build -p datafusion-java-ffi-example --release ``` The example's `System.load` searches the following paths in order: 1. `-Dexample.ffi.lib.path=/abs/path/to/lib...` (explicit override) -2. `examples/native/target/release/` (Maven's cwd is the repo root) -3. `examples/native/target/debug/` -4. `native/target/release/` (cwd inside the `examples` module) -5. `native/target/debug/` +2. `rust-target/release/` (Maven's cwd is the repo root) +3. `rust-target/debug/` +4. `../rust-target/release/` (cwd inside the `examples` module) +5. `../rust-target/debug/` Where `` is `libdatafusion_java_ffi_example.so` on Linux, `libdatafusion_java_ffi_example.dylib` on macOS, or diff --git a/examples/native/Cargo.lock b/examples/native/Cargo.lock deleted file mode 100644 index 0f5ee7a..0000000 --- a/examples/native/Cargo.lock +++ /dev/null @@ -1,3653 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 4 - -[[package]] -name = "abi_stable" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" -dependencies = [ - "abi_stable_derive", - "abi_stable_shared", - "const_panic", - "core_extensions", - "crossbeam-channel", - "generational-arena", - "libloading", - "lock_api", - "parking_lot", - "paste", - "repr_offset", - "rustc_version", - "serde", - "serde_derive", - "serde_json", -] - -[[package]] -name = "abi_stable_derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" -dependencies = [ - "abi_stable_shared", - "as_derive_utils", - "core_extensions", - "proc-macro2", - "quote", - "rustc_version", - "syn 1.0.109", - "typed-arena", -] - -[[package]] -name = "abi_stable_shared" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" -dependencies = [ - "core_extensions", -] - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "const-random", - "getrandom 0.3.4", - "once_cell", - "version_check", - "zerocopy", -] - -[[package]] -name = "aho-corasick" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" -dependencies = [ - "memchr", -] - -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - -[[package]] -name = "anyhow" -version = "1.0.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" - -[[package]] -name = "ar_archive_writer" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" -dependencies = [ - "object", -] - -[[package]] -name = "arrayref" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" - -[[package]] -name = "arrayvec" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" - -[[package]] -name = "arrow" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - 
"arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - -[[package]] -name = "arrow-arith" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "num-traits", -] - -[[package]] -name = "arrow-array" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" -dependencies = [ - "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "chrono-tz", - "half", - "hashbrown 0.17.1", - "num-complex", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-buffer" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - -[[package]] -name = "arrow-cast" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num-traits", - "ryu", -] - -[[package]] -name = "arrow-csv" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" -dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "regex", -] - -[[package]] -name = "arrow-data" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-ipc" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "flatbuffers", - "lz4_flex", - "zstd", -] - -[[package]] -name = "arrow-json" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", - "chrono", - "half", - "indexmap", - "itoa", - "lexical-core", - "memchr", - "num-traits", - "ryu", - "serde_core", - "serde_json", - "simdutf8", -] - -[[package]] -name = "arrow-ord" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", -] - -[[package]] -name = "arrow-row" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", -] - -[[package]] -name = "arrow-schema" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" -dependencies = [ - "bitflags", - "serde_core", - "serde_json", -] - -[[package]] -name = "arrow-select" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num-traits", -] - -[[package]] -name = "arrow-string" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "memchr", - "num-traits", - "regex", - "regex-syntax", -] - -[[package]] -name = "as_derive_utils" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" -dependencies = [ - "core_extensions", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "async-compression" -version = "0.4.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac" -dependencies = [ - "compression-codecs", - "compression-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "async-ffi" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" -dependencies = [ - "abi_stable", -] - -[[package]] -name = "async-trait" -version = "0.1.89" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - -[[package]] -name = "autocfg" -version = "1.5.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bigdecimal" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" -dependencies = [ - "autocfg", - "libm", - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "bitflags" -version = "2.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" - -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "blake3" -version = "1.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq", - "cpufeatures 0.3.0", -] - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - -[[package]] -name = "bumpalo" -version = "3.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" - -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - -[[package]] -name = "cc" -version = "1.2.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" -dependencies = [ - "find-msvc-tools", - "jobserver", - "libc", - "shlex", -] - -[[package]] -name = "cesu8" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" - -[[package]] -name = "cfg-if" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" - -[[package]] -name = "chrono" -version = "0.4.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" -dependencies = [ - "iana-time-zone", - "num-traits", - "windows-link", -] - -[[package]] -name = "chrono-tz" 
-version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" -dependencies = [ - "chrono", - "phf", -] - -[[package]] -name = "combine" -version = "4.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" -dependencies = [ - "bytes", - "memchr", -] - -[[package]] -name = "comfy-table" -version = "7.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" -dependencies = [ - "unicode-segmentation", - "unicode-width", -] - -[[package]] -name = "compression-codecs" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" -dependencies = [ - "bzip2", - "compression-core", - "flate2", - "liblzma", - "memchr", - "zstd", - "zstd-safe", -] - -[[package]] -name = "compression-core" -version = "0.4.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" - -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom 0.2.17", - "once_cell", - "tiny-keccak", -] - -[[package]] -name = "const_panic" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" -dependencies = [ 
- "typewit", -] - -[[package]] -name = "constant_time_eq" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" - -[[package]] -name = "core-foundation-sys" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" - -[[package]] -name = "core_extensions" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" -dependencies = [ - "core_extensions_proc_macros", -] - -[[package]] -name = "core_extensions_proc_macros" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" - -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "cpufeatures" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - -[[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "csv" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde_core", -] - -[[package]] -name = "csv-core" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" -dependencies = [ - "memchr", -] - -[[package]] -name = "dashmap" -version = "6.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "once_cell", - "parking_lot_core", -] - -[[package]] -name = "datafusion" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" -dependencies = [ - "arrow", - "arrow-schema", - "async-trait", - "bytes", - "bzip2", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-datasource-arrow", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - 
"datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-nested", - "datafusion-functions-table", - "datafusion-functions-window", - "datafusion-optimizer", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "flate2", - "futures", - "itertools", - "liblzma", - "log", - "object_store", - "parking_lot", - "parquet", - "rand", - "regex", - "sqlparser", - "tempfile", - "tokio", - "url", - "uuid", - "zstd", -] - -[[package]] -name = "datafusion-catalog" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-catalog-listing" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "futures", - "itertools", - "log", - "object_store", -] - -[[package]] -name = "datafusion-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" -dependencies = [ - "ahash", - "arrow", - 
"arrow-ipc", - "chrono", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "libc", - "log", - "object_store", - "parquet", - "paste", - "recursive", - "sqlparser", - "tokio", - "web-time", -] - -[[package]] -name = "datafusion-common-runtime" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" -dependencies = [ - "futures", - "log", - "tokio", -] - -[[package]] -name = "datafusion-datasource" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" -dependencies = [ - "arrow", - "async-compression", - "async-trait", - "bytes", - "bzip2", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "flate2", - "futures", - "glob", - "itertools", - "liblzma", - "log", - "object_store", - "rand", - "tokio", - "tokio-util", - "url", - "zstd", -] - -[[package]] -name = "datafusion-datasource-arrow" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" -dependencies = [ - "arrow", - "arrow-ipc", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "itertools", - "object_store", - "tokio", -] - -[[package]] -name = "datafusion-datasource-csv" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" -dependencies = 
[ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "regex", - "tokio", -] - -[[package]] -name = "datafusion-datasource-json" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "serde_json", - "tokio", - "tokio-stream", -] - -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - -[[package]] -name = "datafusion-doc" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" - -[[package]] -name = "datafusion-execution" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" -dependencies = [ - "arrow", - "arrow-buffer", - "async-trait", - "chrono", - "dashmap", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-expr-common", - "futures", - "log", - "object_store", - "parking_lot", - "rand", - "tempfile", - "url", -] - -[[package]] -name = "datafusion-expr" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" -dependencies = [ - "arrow", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr-common", - "indexmap", - "itertools", - "paste", - "recursive", - "serde_json", - "sqlparser", -] - -[[package]] -name = "datafusion-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" -dependencies = [ - "arrow", - "datafusion-common", - "indexmap", - "itertools", - "paste", -] - -[[package]] -name = "datafusion-ffi" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" -dependencies = [ - "abi_stable", - "arrow", - "arrow-schema", - "async-ffi", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-proto", - "datafusion-proto-common", - "datafusion-session", - "futures", - "log", - "prost", - "semver", - "tokio", -] - -[[package]] -name = "datafusion-functions" -version = "53.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" -dependencies = [ - "arrow", - "arrow-buffer", - "base64", - "blake2", - "blake3", - "chrono", - "chrono-tz", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-macros", - "hex", - "itertools", - "log", - "md-5", - "memchr", - "num-traits", - "rand", - "regex", - "sha2", - "unicode-segmentation", - "uuid", -] - -[[package]] -name = "datafusion-functions-aggregate" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "half", - "log", - "num-traits", - "paste", -] - -[[package]] -name = "datafusion-functions-aggregate-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-functions-nested" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" -dependencies = [ - "arrow", - "arrow-ord", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr-common", - "hashbrown 0.16.1", - 
"itertools", - "itoa", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-table" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", - "paste", -] - -[[package]] -name = "datafusion-functions-window" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-window-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" -dependencies = [ - "datafusion-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-java-ffi-example" -version = "0.1.0" -dependencies = [ - "arrow", - "datafusion", - "datafusion-ffi", - "jni", - "tokio", -] - -[[package]] -name = "datafusion-macros" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" -dependencies = [ - "datafusion-doc", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "datafusion-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" -dependencies = [ - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", 
- "indexmap", - "itertools", - "log", - "recursive", - "regex", - "regex-syntax", -] - -[[package]] -name = "datafusion-physical-expr" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "parking_lot", - "paste", - "petgraph", - "recursive", - "tokio", -] - -[[package]] -name = "datafusion-physical-expr-adapter" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-functions", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "itertools", -] - -[[package]] -name = "datafusion-physical-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" -dependencies = [ - "ahash", - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr-common", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "parking_lot", -] - -[[package]] -name = "datafusion-physical-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "itertools", - "recursive", -] - -[[package]] -name = "datafusion-physical-plan" -version = "53.1.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" -dependencies = [ - "ahash", - "arrow", - "arrow-ord", - "arrow-schema", - "async-trait", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "futures", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "log", - "num-traits", - "parking_lot", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "datafusion-proto" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738" -dependencies = [ - "arrow", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-datasource", - "datafusion-datasource-arrow", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-table", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-proto-common", - "object_store", - "prost", - "rand", -] - -[[package]] -name = "datafusion-proto-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9" -dependencies = [ - "arrow", - "datafusion-common", - "prost", -] - -[[package]] -name = "datafusion-pruning" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-datasource", - "datafusion-expr-common", - 
"datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "itertools", - "log", -] - -[[package]] -name = "datafusion-session" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" -dependencies = [ - "async-trait", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", -] - -[[package]] -name = "datafusion-sql" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" -dependencies = [ - "arrow", - "bigdecimal", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-functions-nested", - "indexmap", - "log", - "recursive", - "regex", - "sqlparser", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "displaydoc" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "either" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" - -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - -[[package]] -name = "errno" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" -dependencies = [ - "libc", - "windows-sys 0.61.2", -] - -[[package]] -name = "fastrand" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" - -[[package]] -name = "find-msvc-tools" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" - -[[package]] -name = "fixedbitset" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" - -[[package]] -name = "flatbuffers" -version = "25.12.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" -dependencies = [ - "bitflags", - "rustc_version", -] - -[[package]] -name = "flate2" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" -dependencies = [ - "crc32fast", - "miniz_oxide", - "zlib-rs", -] - -[[package]] -name = "foldhash" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" - -[[package]] -name = "foldhash" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" - -[[package]] -name = "form_urlencoded" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures" -version = "0.3.32" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" - -[[package]] -name = "futures-executor" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" - -[[package]] -name = "futures-macro" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "futures-sink" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" - -[[package]] -name = "futures-task" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" - -[[package]] -name = "futures-util" -version = "0.3.32" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "slab", -] - -[[package]] -name = "generational-arena" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "getrandom" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" -dependencies = [ - "cfg-if", - "libc", - "r-efi 5.3.0", - "wasip2", -] - -[[package]] -name = "getrandom" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" -dependencies = [ - "cfg-if", - "libc", - "r-efi 6.0.0", - "wasip2", - "wasip3", -] - -[[package]] -name = "glob" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" - -[[package]] -name = "half" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" -dependencies = [ - "cfg-if", - "crunchy", - "num-traits", - "zerocopy", -] - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "hashbrown" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" -dependencies = [ - "foldhash 0.1.5", -] - -[[package]] -name = "hashbrown" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash 0.2.0", -] - -[[package]] -name = "hashbrown" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "http" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" -dependencies = [ - "bytes", - "itoa", -] - -[[package]] -name = "humantime" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" - -[[package]] -name = "iana-time-zone" -version = "0.1.65" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "log", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - -[[package]] -name = "icu_collections" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" -dependencies = [ - "displaydoc", - "potential_utf", - "utf8_iter", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" -dependencies = [ - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" - -[[package]] -name = "icu_properties" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" -dependencies = [ - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - 
"zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" - -[[package]] -name = "icu_provider" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" -dependencies = [ - "displaydoc", - "icu_locale_core", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - -[[package]] -name = "idna" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "indexmap" -version = "2.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" -dependencies = [ - "equivalent", - "hashbrown 0.17.1", - "serde", - "serde_core", -] - -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - 
"either", -] - -[[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - -[[package]] -name = "jni" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" -dependencies = [ - "cesu8", - "cfg-if", - "combine", - "jni-sys 0.3.1", - "log", - "thiserror 1.0.69", - "walkdir", - "windows-sys 0.45.0", -] - -[[package]] -name = "jni-sys" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" -dependencies = [ - "jni-sys 0.4.1", -] - -[[package]] -name = "jni-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" -dependencies = [ - "jni-sys-macros", -] - -[[package]] -name = "jni-sys-macros" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" -dependencies = [ - "quote", - "syn 2.0.117", -] - -[[package]] -name = "jobserver" -version = "0.1.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" -dependencies = [ - "getrandom 0.3.4", - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" -dependencies = [ - "cfg-if", - "futures-util", - "wasm-bindgen", -] - -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - 
-[[package]] -name = "lexical-core" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" -dependencies = [ - "lexical-parse-integer", - "lexical-util", -] - -[[package]] -name = "lexical-parse-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" -dependencies = [ - "lexical-util", -] - -[[package]] -name = "lexical-util" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" - -[[package]] -name = "lexical-write-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" -dependencies = [ - "lexical-util", - "lexical-write-integer", -] - -[[package]] -name = "lexical-write-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" -dependencies = [ - "lexical-util", -] - -[[package]] -name = "libbz2-rs-sys" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" - -[[package]] -name = "libc" -version = "0.2.186" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" - 
-[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - -[[package]] -name = "liblzma" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" -dependencies = [ - "liblzma-sys", -] - -[[package]] -name = "liblzma-sys" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "libm" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" - -[[package]] -name = "linux-raw-sys" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" - -[[package]] -name = "litemap" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" - -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" - -[[package]] -name = "lz4_flex" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef0d4ed8669f8f8826eb00dc878084aa8f253506c4fd5e8f58f5bce72ddb97e" -dependencies = [ - "twox-hash", -] - 
-[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - -[[package]] -name = "memchr" -version = "2.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" - -[[package]] -name = "miniz_oxide" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" -dependencies = [ - "adler2", - "simd-adler32", -] - -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - -[[package]] -name = "object_store" -version = "0.13.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" -dependencies = [ - "async-trait", - "bytes", - "chrono", - "futures-channel", - "futures-core", - "futures-util", - "http", - "humantime", - "itertools", - "parking_lot", - "percent-encoding", - "thiserror 2.0.18", - "tokio", - "tracing", - "url", - "walkdir", - "wasm-bindgen-futures", - "web-time", -] - -[[package]] -name = "once_cell" -version = "1.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" - -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - -[[package]] -name = "parquet" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.17.1", - "lz4_flex", - "num-bigint", - "num-integer", - "num-traits", - "object_store", - "paste", - "seq-macro", - "simdutf8", - 
"snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - -[[package]] -name = "percent-encoding" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" - -[[package]] -name = "petgraph" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" -dependencies = [ - "fixedbitset", - "hashbrown 0.15.5", - "indexmap", - "serde", -] - -[[package]] -name = "phf" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" - -[[package]] -name = "pkg-config" -version = "0.3.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" - -[[package]] -name = "potential_utf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn 2.0.117", -] - -[[package]] -name = "proc-macro2" -version = "1.0.106" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-derive" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "psm" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" -dependencies = [ - "ar_archive_writer", - "cc", -] - -[[package]] -name = "quote" -version = "1.0.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - -[[package]] -name = "r-efi" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" - -[[package]] -name = "rand" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" -dependencies = [ - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", -] - -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.117", -] - -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" - -[[package]] -name = "repr_offset" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" -dependencies = [ - "tstr", -] - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.61.2", -] - -[[package]] -name = "rustversion" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - -[[package]] -name = "ryu" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "semver" -version = "1.0.28" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" - -[[package]] -name = "seq-macro" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "serde_json" -version = "1.0.150" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" -dependencies = [ - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures 0.2.17", - "digest", -] - -[[package]] -name = "shlex" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" - -[[package]] -name = "simd-adler32" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" - -[[package]] -name = "simdutf8" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" - -[[package]] -name = "siphasher" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" - -[[package]] -name = "slab" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - -[[package]] -name = "sqlparser" -version = "0.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" -dependencies = [ - "log", - "recursive", - "sqlparser_derive", -] - -[[package]] -name = "sqlparser_derive" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" - -[[package]] -name = "stacker" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" 
-dependencies = [ - "cc", - "cfg-if", - "libc", - "psm", - "windows-sys 0.61.2", -] - -[[package]] -name = "subtle" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tempfile" -version = "3.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" -dependencies = [ - "fastrand", - "getrandom 0.4.2", - "once_cell", - "rustix", - "windows-sys 0.61.2", -] - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - -[[package]] -name = "thiserror" -version = "2.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" -dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "thiserror-impl" -version = "2.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float", -] - -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - -[[package]] -name = "tinystr" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" -dependencies = [ - "displaydoc", - "zerovec", -] - -[[package]] -name = "tokio" -version = "1.52.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" -dependencies = [ - "bytes", - "pin-project-lite", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tokio-stream" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" -dependencies = [ - 
"futures-core", - "pin-project-lite", - "tokio", - "tokio-util", -] - -[[package]] -name = "tokio-util" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tracing" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" -dependencies = [ - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tracing-core" -version = "0.1.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" -dependencies = [ - "once_cell", -] - -[[package]] -name = "tstr" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" -dependencies = [ - "tstr_proc_macros", -] - -[[package]] -name = "tstr_proc_macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" - -[[package]] -name = "twox-hash" -version = "2.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" - -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - -[[package]] -name = "typenum" -version = "1.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" - -[[package]] -name = "typewit" -version = "1.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" - -[[package]] -name = "unicode-ident" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" - -[[package]] -name = "unicode-segmentation" -version = "1.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" - -[[package]] -name = "unicode-width" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" - -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - -[[package]] -name = "url" -version = "2.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - -[[package]] -name = "uuid" -version = "1.23.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" -dependencies = [ - "getrandom 0.4.2", - 
"js-sys", - "wasm-bindgen", -] - -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - -[[package]] -name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", -] - -[[package]] -name = "wasm-bindgen" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.73" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" -dependencies = [ - "bumpalo", - "proc-macro2", - "quote", - "syn 2.0.117", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap", - "wasm-encoder", - "wasmparser", -] - -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap", - "semver", -] - -[[package]] -name = "web-time" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - 
"winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" -dependencies = [ - "windows-sys 0.61.2", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-core" -version = "0.62.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-implement" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "windows-interface" -version = "0.59.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "windows-link" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" - -[[package]] -name = "windows-result" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-sys" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - -[[package]] -name = "wit-bindgen" -version = "0.57.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" - -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - -[[package]] -name = "writeable" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" - -[[package]] -name = "yoke" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" -dependencies = [ - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" 
-version = "0.8.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "zerofrom" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" - -[[package]] -name = "zstd" -version = "0.13.3" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.16+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/examples/native/Cargo.toml b/examples/native/Cargo.toml index d10298b..2d51ac2 100644 --- a/examples/native/Cargo.toml +++ b/examples/native/Cargo.toml @@ -20,8 +20,8 @@ publish = false crate-type = ["cdylib", "rlib"] [dependencies] -arrow = { version = "58", features = ["ffi"] } -datafusion = { version = "53.1.0" } -datafusion-ffi = "53.1.0" -jni = "0.21" -tokio = { version = "1", features = ["rt-multi-thread"] } +arrow = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +jni = { workspace = true } +tokio = { workspace = true } diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java index dcb8441..bcb1765 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java @@ -39,7 +39,7 @@ *

How to run (from the fork repo root): * *

{@code
- * (cd examples/native && cargo build --release)
+ * cargo build -p datafusion-java-ffi-example --release
  * mvn -B install -DskipTests -Drat.skip=true \
  *     -Ddatafusion.native.profile=release
  * mvn -B -pl examples exec:exec \
diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
index 3ca2784..ed1eaf1 100644
--- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
+++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
@@ -34,10 +34,11 @@
  *
  * 
    *
  1. Absolute path passed via {@code -Dexample.ffi.lib.path=/abs/path/to/lib...}. - *
  2. {@code examples/native/target/release/} relative to the current working - * directory (the default when invoked via {@code mvn exec:java} from the repo root). - *
  3. {@code examples/native/target/debug/} as a fallback for {@code cargo build} - * without {@code --release}. + *
  4. {@code rust-target/release/} relative to the current working directory + * (the workspace output dir; default when invoked via {@code mvn exec:java} from the + * repo root). + *
  5. {@code rust-target/debug/} as a fallback for {@code cargo build} without + * {@code --release}. *
* *

If none of these exist, an {@link UnsatisfiedLinkError} surfaces with the search list so the @@ -73,14 +74,15 @@ private static void loadLibrary() { Path explicit = optionalPath(System.getProperty("example.ffi.lib.path")); // Cover both common cwds: repo root (mvn exec from datafusion-java/) and - // the examples module (mvn exec from datafusion-java/examples/). + // the examples module (mvn exec from datafusion-java/examples/). The + // workspace writes to `rust-target/` at the repo root. Path[] candidates = new Path[] { explicit, - Paths.get("examples", "native", "target", "release", mapped), - Paths.get("examples", "native", "target", "debug", mapped), - Paths.get("native", "target", "release", mapped), - Paths.get("native", "target", "debug", mapped), + Paths.get("rust-target", "release", mapped), + Paths.get("rust-target", "debug", mapped), + Paths.get("..", "rust-target", "release", mapped), + Paths.get("..", "rust-target", "debug", mapped), }; for (Path candidate : candidates) { @@ -99,7 +101,7 @@ private static void loadLibrary() { String.format( Locale.ROOT, "Example native library %s not found. Searched: [%s]. 
" - + "Build with 'cd examples/native && cargo build --release', or pass " + + "Build with 'cargo build -p datafusion-java-ffi-example --release', or pass " + "-Dexample.ffi.lib.path=.", mapped, searched)); diff --git a/native/Cargo.toml b/native/Cargo.toml index b7e96af..aa56ca6 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -69,26 +69,20 @@ protoc = ["datafusion-substrait?/protoc"] runtime-metrics = ["dep:tokio-metrics"] [dependencies] -arrow = { version = "58", features = ["ffi"] } -async-trait = "0.1" -datafusion = { version = "53.1.0", features = ["avro"] } -datafusion-ffi = "53.1.0" -datafusion-proto = "53.1.0" -datafusion-substrait = { version = "53.1.0", optional = true } -futures = "0.3" -jni = "0.21" -# Pin to the same major as DataFusion 53.1 pulls in transitively (0.13.x) -# so we share the same `dyn ObjectStore` vtable and don't double-link. -object_store = { version = "0.13", default-features = false } -prost = "0.14" -tokio = { version = "1", features = ["rt-multi-thread"] } - -# Tokio runtime metrics. Optional + cfg-gated: this crate's API surface lives -# behind `--cfg tokio_unstable`, so enabling the `runtime-metrics` feature also -# requires the caller to set `RUSTFLAGS="--cfg tokio_unstable"` at build time. 
-tokio-metrics = { version = "0.5", optional = true } -url = "2" +arrow = { workspace = true } +async-trait = { workspace = true } +datafusion = { workspace = true, features = ["avro"] } +datafusion-ffi = { workspace = true } +datafusion-proto = { workspace = true } +datafusion-substrait = { workspace = true, optional = true } +futures = { workspace = true } +jni = { workspace = true } +object_store = { workspace = true } +prost = { workspace = true } +tokio = { workspace = true } +tokio-metrics = { workspace = true, optional = true } +url = { workspace = true } [build-dependencies] -prost-build = "0.14" -protoc-bin-vendored = "3" +prost-build = { workspace = true } +protoc-bin-vendored = { workspace = true } diff --git a/native/src/runtime_metrics.rs b/native/src/runtime_metrics.rs index e69410e..ddd8698 100644 --- a/native/src/runtime_metrics.rs +++ b/native/src/runtime_metrics.rs @@ -196,7 +196,7 @@ pub fn runtime_stats() -> JniResult<[i64; STATS_FIELD_COUNT]> { Err( "datafusion-jni was built without the `runtime-metrics` Cargo feature; \ rebuild the native crate with \ - `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build --features runtime-metrics` \ + `RUSTFLAGS=\"--cfg tokio_unstable\" cargo build -p datafusion-jni --features runtime-metrics` \ to enable SessionContext.runtimeStats" .into(), ) diff --git a/pom.xml b/pom.xml index 282feb9..0bf5612 100644 --- a/pom.xml +++ b/pom.xml @@ -175,11 +175,10 @@ under the License. .mvn/** **/target/** - native/target/** + rust-target/** tpch-data/** - - native/Cargo.lock - **/Cargo.lock + + Cargo.lock **/META-INF/services/** diff --git a/spark/native/Cargo.lock b/spark/native/Cargo.lock deleted file mode 100644 index d22d26e..0000000 --- a/spark/native/Cargo.lock +++ /dev/null @@ -1,3655 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 4 - -[[package]] -name = "abi_stable" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" -dependencies = [ - "abi_stable_derive", - "abi_stable_shared", - "const_panic", - "core_extensions", - "crossbeam-channel", - "generational-arena", - "libloading", - "lock_api", - "parking_lot", - "paste", - "repr_offset", - "rustc_version", - "serde", - "serde_derive", - "serde_json", -] - -[[package]] -name = "abi_stable_derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" -dependencies = [ - "abi_stable_shared", - "as_derive_utils", - "core_extensions", - "proc-macro2", - "quote", - "rustc_version", - "syn 1.0.109", - "typed-arena", -] - -[[package]] -name = "abi_stable_shared" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" -dependencies = [ - "core_extensions", -] - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "const-random", - "getrandom 0.3.4", - "once_cell", - "version_check", - "zerocopy", -] - -[[package]] -name = "aho-corasick" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" -dependencies = [ - "memchr", -] - -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - -[[package]] -name = "anyhow" -version = "1.0.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" - -[[package]] -name = "ar_archive_writer" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" -dependencies = [ - "object", -] - -[[package]] -name = "arrayref" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" - -[[package]] -name = "arrayvec" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" - -[[package]] -name = "arrow" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "378530e55cd479eda3c14eb345310799717e6f76d0c332041e8487022166b471" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - 
"arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - -[[package]] -name = "arrow-arith" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ab212d2c1886e802f51c5212d78ebbcbb0bec980fff9dadc1eb8d45cd0b738" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "num-traits", -] - -[[package]] -name = "arrow-array" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" -dependencies = [ - "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "chrono-tz", - "half", - "hashbrown 0.17.1", - "num-complex", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-buffer" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - -[[package]] -name = "arrow-cast" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c5aefb56a2c02e9e2b30746241058b85f8983f0fcff2ba0c6d09006e1cded7f" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num-traits", - "ryu", -] - -[[package]] -name = "arrow-csv" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94e8cf7e517657a52b91ea1263acf38c4ca62a84655d72458a3359b12ab97de" -dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "regex", -] - -[[package]] -name = "arrow-data" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-ipc" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "flatbuffers", - "lz4_flex", - "zstd", -] - -[[package]] -name = "arrow-json" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "205ca2119e6d679d5c133c6f30e68f027738d95ed948cf77677ea69c7800036b" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", - "chrono", - "half", - "indexmap", - "itoa", - "lexical-core", - "memchr", - "num-traits", - "ryu", - "serde_core", - "serde_json", - "simdutf8", -] - -[[package]] -name = "arrow-ord" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bffd8fd2579286a5d63bac898159873e5094a79009940bcb42bbfce4f19f1d0" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", -] - -[[package]] -name = "arrow-row" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab5994731204603c73ba69267616c50f80780774c6bb0476f1f830625115e0c" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", -] - -[[package]] -name = "arrow-schema" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" -dependencies = [ - "bitflags", - "serde_core", - "serde_json", -] - -[[package]] -name = "arrow-select" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num-traits", -] - -[[package]] -name = "arrow-string" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29dd7cda3ab9692f43a2e4acc444d760cc17b12bb6d8232ddf64e9bab7c06b42" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "memchr", - "num-traits", - "regex", - "regex-syntax", -] - -[[package]] -name = "as_derive_utils" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" -dependencies = [ - "core_extensions", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "async-compression" -version = "0.4.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac" -dependencies = [ - "compression-codecs", - "compression-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "async-ffi" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" -dependencies = [ - "abi_stable", -] - -[[package]] -name = "async-trait" -version = "0.1.89" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - -[[package]] -name = "autocfg" -version = "1.5.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bigdecimal" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" -dependencies = [ - "autocfg", - "libm", - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "bitflags" -version = "2.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" - -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "blake3" -version = "1.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq", - "cpufeatures 0.3.0", -] - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - -[[package]] -name = "bumpalo" -version = "3.20.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" - -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - -[[package]] -name = "cc" -version = "1.2.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" -dependencies = [ - "find-msvc-tools", - "jobserver", - "libc", - "shlex", -] - -[[package]] -name = "cesu8" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" - -[[package]] -name = "cfg-if" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" - -[[package]] -name = "chrono" -version = "0.4.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" -dependencies = [ - "iana-time-zone", - "num-traits", - "windows-link", -] - -[[package]] -name = "chrono-tz" 
-version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" -dependencies = [ - "chrono", - "phf", -] - -[[package]] -name = "combine" -version = "4.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" -dependencies = [ - "bytes", - "memchr", -] - -[[package]] -name = "comfy-table" -version = "7.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" -dependencies = [ - "unicode-segmentation", - "unicode-width", -] - -[[package]] -name = "compression-codecs" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" -dependencies = [ - "bzip2", - "compression-core", - "flate2", - "liblzma", - "memchr", - "zstd", - "zstd-safe", -] - -[[package]] -name = "compression-core" -version = "0.4.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789" - -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom 0.2.17", - "once_cell", - "tiny-keccak", -] - -[[package]] -name = "const_panic" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" -dependencies = [ 
- "typewit", -] - -[[package]] -name = "constant_time_eq" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" - -[[package]] -name = "core-foundation-sys" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" - -[[package]] -name = "core_extensions" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" -dependencies = [ - "core_extensions_proc_macros", -] - -[[package]] -name = "core_extensions_proc_macros" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" - -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "cpufeatures" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - -[[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "csv" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde_core", -] - -[[package]] -name = "csv-core" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" -dependencies = [ - "memchr", -] - -[[package]] -name = "dashmap" -version = "6.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "once_cell", - "parking_lot_core", -] - -[[package]] -name = "datafusion" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" -dependencies = [ - "arrow", - "arrow-schema", - "async-trait", - "bytes", - "bzip2", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-datasource-arrow", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - 
"datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-nested", - "datafusion-functions-table", - "datafusion-functions-window", - "datafusion-optimizer", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "flate2", - "futures", - "itertools", - "liblzma", - "log", - "object_store", - "parking_lot", - "parquet", - "rand", - "regex", - "sqlparser", - "tempfile", - "tokio", - "url", - "uuid", - "zstd", -] - -[[package]] -name = "datafusion-catalog" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-catalog-listing" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "futures", - "itertools", - "log", - "object_store", -] - -[[package]] -name = "datafusion-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" -dependencies = [ - "ahash", - "arrow", - 
"arrow-ipc", - "chrono", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "libc", - "log", - "object_store", - "parquet", - "paste", - "recursive", - "sqlparser", - "tokio", - "web-time", -] - -[[package]] -name = "datafusion-common-runtime" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" -dependencies = [ - "futures", - "log", - "tokio", -] - -[[package]] -name = "datafusion-datasource" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" -dependencies = [ - "arrow", - "async-compression", - "async-trait", - "bytes", - "bzip2", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "flate2", - "futures", - "glob", - "itertools", - "liblzma", - "log", - "object_store", - "rand", - "tokio", - "tokio-util", - "url", - "zstd", -] - -[[package]] -name = "datafusion-datasource-arrow" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" -dependencies = [ - "arrow", - "arrow-ipc", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "itertools", - "object_store", - "tokio", -] - -[[package]] -name = "datafusion-datasource-csv" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" -dependencies = 
[ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "regex", - "tokio", -] - -[[package]] -name = "datafusion-datasource-json" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "serde_json", - "tokio", - "tokio-stream", -] - -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - -[[package]] -name = "datafusion-doc" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" - -[[package]] -name = "datafusion-execution" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" -dependencies = [ - "arrow", - "arrow-buffer", - "async-trait", - "chrono", - "dashmap", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-expr-common", - "futures", - "log", - "object_store", - "parking_lot", - "rand", - "tempfile", - "url", -] - -[[package]] -name = "datafusion-expr" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" -dependencies = [ - "arrow", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr-common", - "indexmap", - "itertools", - "paste", - "recursive", - "serde_json", - "sqlparser", -] - -[[package]] -name = "datafusion-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" -dependencies = [ - "arrow", - "datafusion-common", - "indexmap", - "itertools", - "paste", -] - -[[package]] -name = "datafusion-ffi" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" -dependencies = [ - "abi_stable", - "arrow", - "arrow-schema", - "async-ffi", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-proto", - "datafusion-proto-common", - "datafusion-session", - "futures", - "log", - "prost", - "semver", - "tokio", -] - -[[package]] -name = "datafusion-functions" -version = "53.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" -dependencies = [ - "arrow", - "arrow-buffer", - "base64", - "blake2", - "blake3", - "chrono", - "chrono-tz", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-macros", - "hex", - "itertools", - "log", - "md-5", - "memchr", - "num-traits", - "rand", - "regex", - "sha2", - "unicode-segmentation", - "uuid", -] - -[[package]] -name = "datafusion-functions-aggregate" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "half", - "log", - "num-traits", - "paste", -] - -[[package]] -name = "datafusion-functions-aggregate-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-functions-nested" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" -dependencies = [ - "arrow", - "arrow-ord", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr-common", - "hashbrown 0.16.1", - 
"itertools", - "itoa", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-table" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", - "paste", -] - -[[package]] -name = "datafusion-functions-window" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-window-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" -dependencies = [ - "datafusion-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-macros" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" -dependencies = [ - "datafusion-doc", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "datafusion-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" -dependencies = [ - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "indexmap", - "itertools", - "log", - "recursive", - "regex", - "regex-syntax", -] - -[[package]] -name = "datafusion-physical-expr" -version = "53.1.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "parking_lot", - "paste", - "petgraph", - "recursive", - "tokio", -] - -[[package]] -name = "datafusion-physical-expr-adapter" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-functions", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "itertools", -] - -[[package]] -name = "datafusion-physical-expr-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" -dependencies = [ - "ahash", - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr-common", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "parking_lot", -] - -[[package]] -name = "datafusion-physical-optimizer" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "itertools", - "recursive", -] - -[[package]] -name = "datafusion-physical-plan" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" 
-dependencies = [ - "ahash", - "arrow", - "arrow-ord", - "arrow-schema", - "async-trait", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "futures", - "half", - "hashbrown 0.16.1", - "indexmap", - "itertools", - "log", - "num-traits", - "parking_lot", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "datafusion-proto" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738" -dependencies = [ - "arrow", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-datasource", - "datafusion-datasource-arrow", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-table", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-proto-common", - "object_store", - "prost", - "rand", -] - -[[package]] -name = "datafusion-proto-common" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9" -dependencies = [ - "arrow", - "datafusion-common", - "prost", -] - -[[package]] -name = "datafusion-pruning" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-datasource", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "itertools", - "log", -] - -[[package]] -name = 
"datafusion-session" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" -dependencies = [ - "async-trait", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", -] - -[[package]] -name = "datafusion-spark-helper" -version = "0.1.0" -dependencies = [ - "arrow", - "async-trait", - "datafusion", - "datafusion-ffi", - "futures", - "jni", - "tokio", -] - -[[package]] -name = "datafusion-sql" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" -dependencies = [ - "arrow", - "bigdecimal", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-functions-nested", - "indexmap", - "log", - "recursive", - "regex", - "sqlparser", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "displaydoc" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "either" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" - -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - -[[package]] -name = "errno" -version = "0.3.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" -dependencies = [ - "libc", - "windows-sys 0.61.2", -] - -[[package]] -name = "fastrand" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" - -[[package]] -name = "find-msvc-tools" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" - -[[package]] -name = "fixedbitset" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" - -[[package]] -name = "flatbuffers" -version = "25.12.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" -dependencies = [ - "bitflags", - "rustc_version", -] - -[[package]] -name = "flate2" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" -dependencies = [ - "crc32fast", - "miniz_oxide", - "zlib-rs", -] - -[[package]] -name = "foldhash" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" - -[[package]] -name = "foldhash" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" - -[[package]] -name = "form_urlencoded" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures" -version = "0.3.32" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" - -[[package]] -name = "futures-executor" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" - -[[package]] -name = "futures-macro" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "futures-sink" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" - -[[package]] -name = "futures-task" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" - -[[package]] -name = "futures-util" -version = "0.3.32" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "slab", -] - -[[package]] -name = "generational-arena" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "getrandom" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" -dependencies = [ - "cfg-if", - "libc", - "r-efi 5.3.0", - "wasip2", -] - -[[package]] -name = "getrandom" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" -dependencies = [ - "cfg-if", - "libc", - "r-efi 6.0.0", - "wasip2", - "wasip3", -] - -[[package]] -name = "glob" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" - -[[package]] -name = "half" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" -dependencies = [ - "cfg-if", - "crunchy", - "num-traits", - "zerocopy", -] - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "hashbrown" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" -dependencies = [ - "foldhash 0.1.5", -] - -[[package]] -name = "hashbrown" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash 0.2.0", -] - -[[package]] -name = "hashbrown" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "http" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" -dependencies = [ - "bytes", - "itoa", -] - -[[package]] -name = "humantime" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" - -[[package]] -name = "iana-time-zone" -version = "0.1.65" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "log", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - -[[package]] -name = "icu_collections" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" -dependencies = [ - "displaydoc", - "potential_utf", - "utf8_iter", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" -dependencies = [ - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" - -[[package]] -name = "icu_properties" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" -dependencies = [ - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - 
"zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" - -[[package]] -name = "icu_provider" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" -dependencies = [ - "displaydoc", - "icu_locale_core", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - -[[package]] -name = "idna" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "indexmap" -version = "2.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" -dependencies = [ - "equivalent", - "hashbrown 0.17.1", - "serde", - "serde_core", -] - -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - 
"either", -] - -[[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - -[[package]] -name = "jni" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" -dependencies = [ - "cesu8", - "cfg-if", - "combine", - "jni-sys 0.3.1", - "log", - "thiserror 1.0.69", - "walkdir", - "windows-sys 0.45.0", -] - -[[package]] -name = "jni-sys" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" -dependencies = [ - "jni-sys 0.4.1", -] - -[[package]] -name = "jni-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" -dependencies = [ - "jni-sys-macros", -] - -[[package]] -name = "jni-sys-macros" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" -dependencies = [ - "quote", - "syn 2.0.117", -] - -[[package]] -name = "jobserver" -version = "0.1.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" -dependencies = [ - "getrandom 0.3.4", - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" -dependencies = [ - "cfg-if", - "futures-util", - "wasm-bindgen", -] - -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - 
-[[package]] -name = "lexical-core" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" -dependencies = [ - "lexical-parse-integer", - "lexical-util", -] - -[[package]] -name = "lexical-parse-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" -dependencies = [ - "lexical-util", -] - -[[package]] -name = "lexical-util" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" - -[[package]] -name = "lexical-write-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" -dependencies = [ - "lexical-util", - "lexical-write-integer", -] - -[[package]] -name = "lexical-write-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" -dependencies = [ - "lexical-util", -] - -[[package]] -name = "libbz2-rs-sys" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" - -[[package]] -name = "libc" -version = "0.2.186" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" - 
-[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - -[[package]] -name = "liblzma" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" -dependencies = [ - "liblzma-sys", -] - -[[package]] -name = "liblzma-sys" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "libm" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" - -[[package]] -name = "linux-raw-sys" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" - -[[package]] -name = "litemap" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" - -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" - -[[package]] -name = "lz4_flex" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef0d4ed8669f8f8826eb00dc878084aa8f253506c4fd5e8f58f5bce72ddb97e" -dependencies = [ - "twox-hash", -] - 
-[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - -[[package]] -name = "memchr" -version = "2.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" - -[[package]] -name = "miniz_oxide" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" -dependencies = [ - "adler2", - "simd-adler32", -] - -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - -[[package]] -name = "object_store" -version = "0.13.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" -dependencies = [ - "async-trait", - "bytes", - "chrono", - "futures-channel", - "futures-core", - "futures-util", - "http", - "humantime", - "itertools", - "parking_lot", - "percent-encoding", - "thiserror 2.0.18", - "tokio", - "tracing", - "url", - "walkdir", - "wasm-bindgen-futures", - "web-time", -] - -[[package]] -name = "once_cell" -version = "1.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" - -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - -[[package]] -name = "parquet" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.17.1", - "lz4_flex", - "num-bigint", - "num-integer", - "num-traits", - "object_store", - "paste", - "seq-macro", - "simdutf8", - 
"snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - -[[package]] -name = "percent-encoding" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" - -[[package]] -name = "petgraph" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" -dependencies = [ - "fixedbitset", - "hashbrown 0.15.5", - "indexmap", - "serde", -] - -[[package]] -name = "phf" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" - -[[package]] -name = "pkg-config" -version = "0.3.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" - -[[package]] -name = "potential_utf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - -[[package]] -name = "prettyplease" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" -dependencies = [ - "proc-macro2", - "syn 2.0.117", -] - -[[package]] -name = "proc-macro2" -version = "1.0.106" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-derive" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "psm" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" -dependencies = [ - "ar_archive_writer", - "cc", -] - -[[package]] -name = "quote" -version = "1.0.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - -[[package]] -name = "r-efi" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" - -[[package]] -name = "rand" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" -dependencies = [ - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" -dependencies = [ - "getrandom 0.3.4", -] - -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.117", -] - -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" - -[[package]] -name = "repr_offset" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" -dependencies = [ - "tstr", -] - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.61.2", -] - -[[package]] -name = "rustversion" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - -[[package]] -name = "ryu" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "semver" -version = "1.0.28" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" - -[[package]] -name = "seq-macro" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "serde_json" -version = "1.0.150" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" -dependencies = [ - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures 0.2.17", - "digest", -] - -[[package]] -name = "shlex" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" - -[[package]] -name = "simd-adler32" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" - -[[package]] -name = "simdutf8" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" - -[[package]] -name = "siphasher" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" - -[[package]] -name = "slab" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - -[[package]] -name = "sqlparser" -version = "0.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" -dependencies = [ - "log", - "recursive", - "sqlparser_derive", -] - -[[package]] -name = "sqlparser_derive" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" - -[[package]] -name = "stacker" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" 
-dependencies = [ - "cc", - "cfg-if", - "libc", - "psm", - "windows-sys 0.61.2", -] - -[[package]] -name = "subtle" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tempfile" -version = "3.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" -dependencies = [ - "fastrand", - "getrandom 0.4.2", - "once_cell", - "rustix", - "windows-sys 0.61.2", -] - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - -[[package]] -name = "thiserror" -version = "2.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" -dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "thiserror-impl" -version = "2.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float", -] - -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - -[[package]] -name = "tinystr" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" -dependencies = [ - "displaydoc", - "zerovec", -] - -[[package]] -name = "tokio" -version = "1.52.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" -dependencies = [ - "bytes", - "pin-project-lite", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tokio-stream" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" -dependencies = [ - 
"futures-core", - "pin-project-lite", - "tokio", - "tokio-util", -] - -[[package]] -name = "tokio-util" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tracing" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" -dependencies = [ - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "tracing-core" -version = "0.1.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" -dependencies = [ - "once_cell", -] - -[[package]] -name = "tstr" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" -dependencies = [ - "tstr_proc_macros", -] - -[[package]] -name = "tstr_proc_macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" - -[[package]] -name = "twox-hash" -version = "2.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" - -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - -[[package]] -name = "typenum" -version = "1.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" - -[[package]] -name = "typewit" -version = "1.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" - -[[package]] -name = "unicode-ident" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" - -[[package]] -name = "unicode-segmentation" -version = "1.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" - -[[package]] -name = "unicode-width" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" - -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - -[[package]] -name = "url" -version = "2.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - -[[package]] -name = "uuid" -version = "1.23.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" -dependencies = [ - "getrandom 0.4.2", - 
"js-sys", - "wasm-bindgen", -] - -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - -[[package]] -name = "wasip2" -version = "1.0.3+wasi-0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" -dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", -] - -[[package]] -name = "wasm-bindgen" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.73" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" -dependencies = [ - "bumpalo", - "proc-macro2", - "quote", - "syn 2.0.117", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.123" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap", - "wasm-encoder", - "wasmparser", -] - -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap", - "semver", -] - -[[package]] -name = "web-time" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - 
"winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" -dependencies = [ - "windows-sys 0.61.2", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-core" -version = "0.62.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-implement" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "windows-interface" -version = "0.59.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "windows-link" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" - -[[package]] -name = "windows-result" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets", -] - -[[package]] -name = "windows-sys" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - -[[package]] -name = "wit-bindgen" -version = "0.57.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" - -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - -[[package]] -name = "writeable" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" - -[[package]] -name = "yoke" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" -dependencies = [ - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" 
-version = "0.8.52" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "zerofrom" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" - -[[package]] -name = "zstd" -version = "0.13.3" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.16+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/spark/native/Cargo.toml b/spark/native/Cargo.toml index 4650b2a..5f42bf2 100644 --- a/spark/native/Cargo.toml +++ b/spark/native/Cargo.toml @@ -20,10 +20,10 @@ publish = false crate-type = ["cdylib", "rlib"] [dependencies] -arrow = { version = "58", features = ["ffi"] } -async-trait = "0.1" -datafusion = { version = "53.1.0" } -datafusion-ffi = "53.1.0" -futures = "0.3" -jni = "0.21" -tokio = { version = "1", features = ["rt-multi-thread"] } +arrow = { workspace = true } +async-trait = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +futures = { workspace = true } +jni = { workspace = true } +tokio = { workspace = true } diff --git a/spark/pom.xml b/spark/pom.xml index 82586ff..05355de 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -140,8 +140,8 @@ under the License. - + value="${maven.multiModuleProjectDirectory}/rust-target/${datafusion.native.profile}/${datafusion.spark.helper.filename}"/> + From 8db9d4ad4303fccb81f959421f5d17d547d34394 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 16:55:50 +0200 Subject: [PATCH 05/22] feat(examples): pass user options through FFI table provider demo Demonstrate how Spark DataSource V2 options flow through the FfiProviderFactory into the native MemTable build. 
The example factory now accepts name_prefix, num_rows, and num_batches, encodes them as a small length-prefixed binary blob, and the Rust cdylib decodes the blob to size the generated table accordingly. Co-Authored-By: Claude Opus 4.7 --- examples/native/src/lib.rs | 177 ++++++++++++++++-- examples/python/ffi_table_provider_demo.py | 24 ++- .../examples/ExampleFfiProviderFactory.java | 78 +++++++- .../examples/FfiTableProviderExample.java | 5 +- .../FfiTableProviderExampleNative.java | 14 +- 5 files changed, 260 insertions(+), 38 deletions(-) diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index 9d6406f..12f4805 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -23,6 +23,19 @@ //! The same pattern is what domain bridges (Rerun, HDF5, custom Iceberg) use //! to expose their TableProviders to DataFusion-Java — and, transitively, to //! Spark via the connector-core DataSource V2 plumbing. +//! +//! ## Options wire format +//! +//! `createMemTableProvider` accepts an opaque `byte[]` that the JVM-side +//! `ExampleFfiProviderFactory.encodeOptions` produces. Layout (little-endian): +//! +//! ```text +//! [u32 name_prefix_len][name_prefix UTF-8 bytes][u32 num_rows][u32 num_batches] +//! ``` +//! +//! Empty/`null` bytes decode as all defaults: `name_prefix="row"`, `num_rows=4`, +//! `num_batches=1`. Real bridges use a real proto schema here; this example +//! hand-rolls the encoding to keep the wire layer obvious. 
use std::sync::Arc; @@ -34,7 +47,7 @@ use datafusion::execution::TaskContextProvider; use datafusion::prelude::SessionContext; use datafusion_ffi::execution::FFI_TaskContextProvider; use datafusion_ffi::table_provider::FFI_TableProvider; -use jni::objects::JClass; +use jni::objects::{JByteArray, JClass}; use jni::sys::jlong; use jni::JNIEnv; use tokio::runtime::{Handle, Runtime}; @@ -57,37 +70,115 @@ fn host_session_context() -> &'static Arc { CTX.get_or_init(|| Arc::new(SessionContext::new())) } -/// Build the example schema + a single-batch in-memory table. -fn build_mem_table() -> Result, Box> { +#[derive(Debug)] +struct Options { + name_prefix: String, + num_rows: u32, + num_batches: u32, +} + +impl Default for Options { + fn default() -> Self { + Self { + name_prefix: "row".to_string(), + num_rows: 4, + num_batches: 1, + } + } +} + +fn decode_options( + bytes: &[u8], +) -> Result> { + if bytes.is_empty() { + return Ok(Options::default()); + } + if bytes.len() < 4 { + return Err("options blob too short for name_prefix length prefix".into()); + } + let name_len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize; + let name_end = 4 + name_len; + if bytes.len() < name_end + 8 { + return Err("options blob truncated: missing name_prefix bytes or trailing ints".into()); + } + let name_prefix = std::str::from_utf8(&bytes[4..name_end]) + .map_err(|e| format!("name_prefix is not valid UTF-8: {e}"))? + .to_string(); + let num_rows = u32::from_le_bytes(bytes[name_end..name_end + 4].try_into().unwrap()); + let num_batches = + u32::from_le_bytes(bytes[name_end + 4..name_end + 8].try_into().unwrap()); + if num_rows == 0 || num_batches == 0 { + return Err("num_rows and num_batches must both be > 0".into()); + } + Ok(Options { + name_prefix, + num_rows, + num_batches, + }) +} + +/// Build the example schema + a multi-batch in-memory table sized per `opts`. 
+/// Row `r` in batch `b` gets `id = b * num_rows + r`, `name = ""`, +/// `value = id * 1.5` (with `value` left null for every fourth row so the demo +/// still exercises null handling). +fn build_mem_table( + opts: &Options, +) -> Result, Box> { let schema = Arc::new(ArrowSchema::new(vec![ Field::new("id", DataType::Int64, false), Field::new("name", DataType::Utf8, true), Field::new("value", DataType::Float64, true), ])); - let ids = Int64Array::from(vec![1, 2, 3, 4]); - let names = StringArray::from(vec![Some("alice"), Some("bob"), None, Some("dave")]); - let values = Float64Array::from(vec![Some(1.5), Some(2.5), Some(3.5), None]); - - let batch = RecordBatch::try_new( - Arc::clone(&schema), - vec![Arc::new(ids), Arc::new(names), Arc::new(values)], - )?; + let mut batches = Vec::with_capacity(opts.num_batches as usize); + for b in 0..opts.num_batches { + let mut ids = Vec::with_capacity(opts.num_rows as usize); + let mut names: Vec> = Vec::with_capacity(opts.num_rows as usize); + let mut values: Vec> = Vec::with_capacity(opts.num_rows as usize); + for r in 0..opts.num_rows { + let id = (b as i64) * (opts.num_rows as i64) + (r as i64); + ids.push(id); + names.push(Some(format!("{}{}", opts.name_prefix, id))); + values.push(if id % 4 == 3 { None } else { Some(id as f64 * 1.5) }); + } + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int64Array::from(ids)), + Arc::new(StringArray::from(names)), + Arc::new(Float64Array::from(values)), + ], + )?; + batches.push(batch); + } - Ok(Arc::new(MemTable::try_new(schema, vec![vec![batch]])?)) + // Wrap all batches inside a single MemTable partition so the example stays + // single-partition end-to-end; configuring DataFusion-level partitions + // would need separate plumbing in the Spark connector to surface them. 
+ Ok(Arc::new(MemTable::try_new(schema, vec![batches])?)) } -/// JNI entry point: build a small `MemTable`, wrap it in an `FFI_TableProvider`, -/// return the raw boxed pointer as a `jlong`. Ownership of the boxed FFI -/// transfers to the caller — the matching `Box::from_raw` is performed by -/// `SessionContext.registerFfiTable` on the consumer side. +/// JNI entry point: decode the options blob, build a `MemTable` accordingly, +/// wrap it in an `FFI_TableProvider`, return the raw boxed pointer as a `jlong`. +/// Ownership of the boxed FFI transfers to the caller — the matching +/// `Box::from_raw` is performed by `SessionContext.registerFfiTable` on the +/// consumer side. #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider<'local>( mut env: JNIEnv<'local>, _class: JClass<'local>, + options_bytes: JByteArray<'local>, ) -> jlong { let result: Result> = (|| { - let mem_table = build_mem_table()?; + let bytes: Vec = if options_bytes.is_null() { + Vec::new() + } else { + env.convert_byte_array(&options_bytes) + .map_err(|e| format!("failed to read options byte[] from JVM: {e}"))? 
+ }; + let opts = decode_options(&bytes)?; + + let mem_table = build_mem_table(&opts)?; let provider: Arc = mem_table; let ctx_provider: Arc = @@ -128,3 +219,55 @@ pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExamp } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_bytes_decodes_to_defaults() { + let o = decode_options(&[]).unwrap(); + assert_eq!(o.name_prefix, "row"); + assert_eq!(o.num_rows, 4); + assert_eq!(o.num_batches, 1); + } + + #[test] + fn roundtrip_decodes_options() { + let prefix = "user"; + let mut buf = Vec::new(); + buf.extend_from_slice(&(prefix.len() as u32).to_le_bytes()); + buf.extend_from_slice(prefix.as_bytes()); + buf.extend_from_slice(&5u32.to_le_bytes()); + buf.extend_from_slice(&3u32.to_le_bytes()); + let o = decode_options(&buf).unwrap(); + assert_eq!(o.name_prefix, "user"); + assert_eq!(o.num_rows, 5); + assert_eq!(o.num_batches, 3); + } + + #[test] + fn build_table_has_expected_schema() { + let opts = Options { + name_prefix: "user".to_string(), + num_rows: 5, + num_batches: 3, + }; + let table = build_mem_table(&opts).unwrap(); + let schema = table.schema(); + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(0).name(), "id"); + assert_eq!(schema.field(1).name(), "name"); + assert_eq!(schema.field(2).name(), "value"); + } + + #[test] + fn rejects_zero_counts() { + let mut buf = Vec::new(); + buf.extend_from_slice(&3u32.to_le_bytes()); + buf.extend_from_slice(b"abc"); + buf.extend_from_slice(&0u32.to_le_bytes()); + buf.extend_from_slice(&1u32.to_le_bytes()); + assert!(decode_options(&buf).is_err()); + } +} diff --git a/examples/python/ffi_table_provider_demo.py b/examples/python/ffi_table_provider_demo.py index 64128ea..00389ca 100644 --- a/examples/python/ffi_table_provider_demo.py +++ b/examples/python/ffi_table_provider_demo.py @@ -147,26 +147,40 @@ def main() -> None: # otherwise set example.ffi.lib.path via spark.driver.extraJavaOptions. 
os.chdir(REPO_ROOT) + # `name_prefix`, `num_rows`, `num_batches` are interpreted by + # ExampleFfiProviderFactory.encodeOptions and decoded on the Rust side + # in examples/native/src/lib.rs. They demonstrate driver-side options + # flowing through to the native MemTable build. + name_prefix = "user" + num_rows = 5 + num_batches = 3 df = ( spark.read.format("datafusion") .option( "df.factory", "org.apache.datafusion.examples.ExampleFfiProviderFactory", ) + .option("name_prefix", name_prefix) + .option("num_rows", str(num_rows)) + .option("num_batches", str(num_batches)) .load() ) + total_rows = num_rows * num_batches + print(f"=== options: name_prefix={name_prefix} num_rows={num_rows} num_batches={num_batches} ===") + print(f"=== expecting {total_rows} rows across {num_batches} Arrow batches ===") + print("=== schema ===") df.printSchema() - print("=== full scan ===") - df.show(truncate=False) + print(f"=== full scan (first {total_rows} rows) ===") + df.show(n=total_rows, truncate=False) - print("=== filter pushdown: value > 2.0 ===") - df.filter("value > 2.0").show(truncate=False) + print("=== filter pushdown: value > 5.0 ===") + df.filter("value > 5.0").show(n=total_rows, truncate=False) print("=== projection: id, name ===") - df.select("id", "name").show(truncate=False) + df.select("id", "name").show(n=total_rows, truncate=False) spark.stop() diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index 8e6e30a..04fd1d9 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -19,36 +19,77 @@ package org.apache.datafusion.examples; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; import java.util.Map; import io.datafusion.spark.FfiProviderFactory; /** * 
Minimal {@link FfiProviderFactory} that exposes the example {@code MemTable} produced by {@link - * FfiTableProviderExampleNative#createMemTableProvider()} as a Spark DataSource V2 source. + * FfiTableProviderExampleNative#createMemTableProvider(byte[])} as a Spark DataSource V2 source. * *

Wire it into PySpark with: * *

{@code
  * df = (spark.read.format("datafusion")
  *         .option("df.factory", "org.apache.datafusion.examples.ExampleFfiProviderFactory")
+ *         .option("name_prefix", "user")
+ *         .option("num_rows", "5")
+ *         .option("num_batches", "3")
  *         .load())
  * }
* - *

No driver-side options are interpreted — the underlying {@code MemTable} is hard-coded in the - * cdylib at {@code examples/native}. A single partition (id {@code "p0"}) is reported so Spark - * spawns one task; the executor calls {@link #createProvider(byte[])} to obtain a fresh {@code - * FFI_TableProvider} pointer, hands it to {@link - * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the resulting - * Arrow record batches back into the Spark scan. + *

Supported options (all optional): + * + *

    + *
  • {@code name_prefix} — prefix string used for generated {@code name} column values. Default + * {@code "row"}. + *
  • {@code num_rows} — rows per batch. Default {@code 4}. + *
  • {@code num_batches} — number of in-memory {@code RecordBatch}es composing the table. Default + * {@code 1}. + *
+ * + *

Real bridges (Rerun, HDF5, custom Iceberg) use a protobuf schema for {@code optionsProtoBytes}; + * this example uses a hand-rolled length-prefixed binary format to keep the wire layer obvious: + * + *

+ *   [u32 LE name_prefix_len][name_prefix UTF-8 bytes][u32 LE num_rows][u32 LE num_batches]
+ * 
+ * + *

An empty {@code byte[]} is also accepted by the native side and decoded as all defaults. + * + *

A single partition (id {@code "p0"}) is reported so Spark spawns one task; the executor calls + * {@link #createProvider(byte[])} to obtain a fresh {@code FFI_TableProvider} pointer, hands it to + * {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the + * resulting Arrow record batches back into the Spark scan. */ public final class ExampleFfiProviderFactory implements FfiProviderFactory { + static final String OPT_NAME_PREFIX = "name_prefix"; + static final String OPT_NUM_ROWS = "num_rows"; + static final String OPT_NUM_BATCHES = "num_batches"; + + static final String DEFAULT_NAME_PREFIX = "row"; + static final int DEFAULT_NUM_ROWS = 4; + static final int DEFAULT_NUM_BATCHES = 1; + public ExampleFfiProviderFactory() {} @Override public byte[] encodeOptions(Map sparkOptions) { - return new byte[0]; + String namePrefix = sparkOptions.getOrDefault(OPT_NAME_PREFIX, DEFAULT_NAME_PREFIX); + int numRows = parsePositiveInt(sparkOptions, OPT_NUM_ROWS, DEFAULT_NUM_ROWS); + int numBatches = parsePositiveInt(sparkOptions, OPT_NUM_BATCHES, DEFAULT_NUM_BATCHES); + + byte[] nameBytes = namePrefix.getBytes(StandardCharsets.UTF_8); + ByteBuffer buf = ByteBuffer.allocate(4 + nameBytes.length + 4 + 4).order(ByteOrder.LITTLE_ENDIAN); + buf.putInt(nameBytes.length); + buf.put(nameBytes); + buf.putInt(numRows); + buf.putInt(numBatches); + return buf.array(); } @Override @@ -58,6 +99,25 @@ public String[] listPartitions(byte[] optionsProtoBytes) { @Override public long createProvider(byte[] optionsProtoBytes) { - return FfiTableProviderExampleNative.createMemTableProvider(); + return FfiTableProviderExampleNative.createMemTableProvider(optionsProtoBytes); + } + + private static int parsePositiveInt(Map opts, String key, int defaultValue) { + String raw = opts.get(key); + if (raw == null || raw.isEmpty()) { + return defaultValue; + } + int parsed; + try { + parsed = Integer.parseInt(raw.trim()); + } catch (NumberFormatException e) { + throw new 
IllegalArgumentException( + "ExampleFfiProviderFactory: option '" + key + "' must be an integer, got: " + raw); + } + if (parsed <= 0) { + throw new IllegalArgumentException( + "ExampleFfiProviderFactory: option '" + key + "' must be > 0, got: " + parsed); + } + return parsed; } } diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java index bcb1765..baa5dae 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java @@ -58,8 +58,9 @@ private FfiTableProviderExample() {} public static void main(String[] args) throws Exception { // Build the FFI provider on the Rust side. The returned `long` is a // `Box::into_raw(Box::new(FFI_TableProvider))` pointer; ownership flows - // through `registerFfiTable` into the SessionContext. - long ffiProviderPtr = FfiTableProviderExampleNative.createMemTableProvider(); + // through `registerFfiTable` into the SessionContext. Empty options bytes + // pick the native defaults (name_prefix="row", num_rows=4, num_batches=1). 
+ long ffiProviderPtr = FfiTableProviderExampleNative.createMemTableProvider(new byte[0]); if (ffiProviderPtr == 0) { throw new IllegalStateException("Native FFI provider builder returned 0"); } diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java index ed1eaf1..ebade50 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java @@ -55,12 +55,16 @@ private FfiTableProviderExampleNative() {} } /** - * Build a tiny {@code MemTable} on the Rust side, wrap it in an {@code FFI_TableProvider}, and - * return the raw boxed pointer as a {@code long}. Ownership transfers to the caller; passing the - * pointer to {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)} - * discharges it. + * Build a {@code MemTable} on the Rust side, wrap it in an {@code FFI_TableProvider}, and return + * the raw boxed pointer as a {@code long}. Ownership transfers to the caller; passing the pointer + * to {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)} discharges it. + * + *

{@code optionsBytes} is the length-prefixed binary blob produced by {@link + * ExampleFfiProviderFactory#encodeOptions(java.util.Map)}. An empty or {@code null} array + * decodes as all defaults ({@code name_prefix="row"}, {@code num_rows=4}, {@code + * num_batches=1}). */ - static native long createMemTableProvider(); + static native long createMemTableProvider(byte[] optionsBytes); /** * Drop an FFI_TableProvider pointer that was NEVER handed to {@code From e8c70a9b2c6ebe61b0978251760b4cad86eacc08 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 18:02:07 +0200 Subject: [PATCH 06/22] update examples to build after last commit --- .../examples/ExampleFfiProviderFactory.java | 12 +++++++----- .../examples/FfiTableProviderExampleNative.java | 14 ++++++-------- pom.xml | 5 +++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index 04fd1d9..18b9668 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -47,12 +47,13 @@ *

  • {@code name_prefix} — prefix string used for generated {@code name} column values. Default * {@code "row"}. *
  • {@code num_rows} — rows per batch. Default {@code 4}. - *
  • {@code num_batches} — number of in-memory {@code RecordBatch}es composing the table. Default - * {@code 1}. + *
  • {@code num_batches} — number of in-memory {@code RecordBatch}es composing the table. + * Default {@code 1}. * * - *

    Real bridges (Rerun, HDF5, custom Iceberg) use a protobuf schema for {@code optionsProtoBytes}; - * this example uses a hand-rolled length-prefixed binary format to keep the wire layer obvious: + *

    Real bridges (Rerun, HDF5, custom Iceberg) use a protobuf schema for {@code + * optionsProtoBytes}; this example uses a hand-rolled length-prefixed binary format to keep the + * wire layer obvious: * *

      *   [u32 LE name_prefix_len][name_prefix UTF-8 bytes][u32 LE num_rows][u32 LE num_batches]
    @@ -84,7 +85,8 @@ public byte[] encodeOptions(Map sparkOptions) {
         int numBatches = parsePositiveInt(sparkOptions, OPT_NUM_BATCHES, DEFAULT_NUM_BATCHES);
     
         byte[] nameBytes = namePrefix.getBytes(StandardCharsets.UTF_8);
    -    ByteBuffer buf = ByteBuffer.allocate(4 + nameBytes.length + 4 + 4).order(ByteOrder.LITTLE_ENDIAN);
    +    ByteBuffer buf =
    +        ByteBuffer.allocate(4 + nameBytes.length + 4 + 4).order(ByteOrder.LITTLE_ENDIAN);
         buf.putInt(nameBytes.length);
         buf.put(nameBytes);
         buf.putInt(numRows);
    diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
    index ebade50..612fd8d 100644
    --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
    +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java
    @@ -34,11 +34,10 @@
      *
      * 
      *
    1. Absolute path passed via {@code -Dexample.ffi.lib.path=/abs/path/to/lib...}. - *
    2. {@code rust-target/release/} relative to the current working directory - * (the workspace output dir; default when invoked via {@code mvn exec:java} from the - * repo root). - *
    3. {@code rust-target/debug/} as a fallback for {@code cargo build} without - * {@code --release}. + *
    4. {@code rust-target/release/} relative to the current working directory (the + * workspace output dir; default when invoked via {@code mvn exec:java} from the repo root). + *
    5. {@code rust-target/debug/} as a fallback for {@code cargo build} without {@code + * --release}. *
    * *

    If none of these exist, an {@link UnsatisfiedLinkError} surfaces with the search list so the @@ -60,9 +59,8 @@ private FfiTableProviderExampleNative() {} * to {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)} discharges it. * *

    {@code optionsBytes} is the length-prefixed binary blob produced by {@link - * ExampleFfiProviderFactory#encodeOptions(java.util.Map)}. An empty or {@code null} array - * decodes as all defaults ({@code name_prefix="row"}, {@code num_rows=4}, {@code - * num_batches=1}). + * ExampleFfiProviderFactory#encodeOptions(java.util.Map)}. An empty or {@code null} array decodes + * as all defaults ({@code name_prefix="row"}, {@code num_rows=4}, {@code num_batches=1}). */ static native long createMemTableProvider(byte[] optionsBytes); diff --git a/pom.xml b/pom.xml index 0bf5612..8ff131c 100644 --- a/pom.xml +++ b/pom.xml @@ -96,6 +96,11 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + org.apache.maven.plugins maven-surefire-plugin From 088474d8699aeb6abd2900cda65da018d0a3e712 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 18:29:00 +0200 Subject: [PATCH 07/22] feat(spark): per-partition payload + preferred locations in FFI factory Replace `String[] listPartitions` with `PartitionInfo[]` carrying an opaque per-partition byte payload and optional host hints, and pass that payload to `createProvider(opts, partitionBytes)` on the executor. The partition record overrides `preferredLocations()` so Spark co-locates tasks with the data. This is the connector-API change needed before bridges can split a dataset across N tasks (each materialising only its slice) or pin partitions to specific Spark workers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../examples/ExampleFfiProviderFactory.java | 17 ++++-- .../datafusion/spark/FfiProviderFactory.java | 44 ++++++++++----- .../io/datafusion/spark/PartitionInfo.java | 56 +++++++++++++++++++ .../io/datafusion/spark/DatafusionBatch.scala | 15 +++-- .../DatafusionColumnarPartitionReader.scala | 7 ++- .../spark/DatafusionInputPartition.scala | 22 ++++++-- .../datafusion/spark/DatafusionSource.scala | 4 +- 7 files changed, 130 insertions(+), 35 deletions(-) create mode 100644 spark/src/main/java/io/datafusion/spark/PartitionInfo.java diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index 18b9668..f2890c5 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -25,6 +25,7 @@ import java.util.Map; import io.datafusion.spark.FfiProviderFactory; +import io.datafusion.spark.PartitionInfo; /** * Minimal {@link FfiProviderFactory} that exposes the example {@code MemTable} produced by {@link @@ -61,9 +62,10 @@ * *

    An empty {@code byte[]} is also accepted by the native side and decoded as all defaults. * - *

    A single partition (id {@code "p0"}) is reported so Spark spawns one task; the executor calls - * {@link #createProvider(byte[])} to obtain a fresh {@code FFI_TableProvider} pointer, hands it to - * {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the + *

    A single partition (id {@code "p0"}, empty {@code partitionBytes}, no preferred host) is + * reported so Spark spawns one task; the executor calls {@link #createProvider(byte[], byte[])} + * to obtain a fresh {@code FFI_TableProvider} pointer, hands it to {@link + * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the * resulting Arrow record batches back into the Spark scan. */ public final class ExampleFfiProviderFactory implements FfiProviderFactory { @@ -95,12 +97,15 @@ public byte[] encodeOptions(Map sparkOptions) { } @Override - public String[] listPartitions(byte[] optionsProtoBytes) { - return new String[] {"p0"}; + public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { + // Single partition; the example MemTable is not actually sliced. A real bridge would + // populate `partitionBytes` per slice and `preferredLocations` with the hosts holding it. + return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])}; } @Override - public long createProvider(byte[] optionsProtoBytes) { + public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { + // The example bridge has no per-partition state; `partitionBytes` is ignored. return FfiTableProviderExampleNative.createMemTableProvider(optionsProtoBytes); } diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 1d01f70..3443f3a 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -31,12 +31,16 @@ *

      *
    1. {@link #encodeOptions(Map)} — driver-side, converts the Spark options map into the bridge's * own proto bytes; ships verbatim through {@code DatafusionInputPartition}. - *
    2. {@link #listPartitions(byte[])} — driver-side, enumerates partition identifiers (e.g. Rerun - * segment ids) so each gets its own Spark task. - *
    3. {@link #createProvider(byte[])} — executor-side, builds the bridge's {@code Arc<dyn - * TableProvider>}, wraps it in an {@code FFI_TableProvider}, returns the raw boxed pointer - * as a {@code jlong}. The caller owns this pointer and is responsible for handing it to - * exactly one consumer (the consumer's {@code Drop} releases it). + *
    4. {@link #listPartitions(byte[])} — driver-side, enumerates partitions as {@link + * PartitionInfo} entries. One Spark task is created per entry. Each entry carries an opaque + * {@code partitionBytes} payload that is shipped to the executor and replayed into {@link + * #createProvider(byte[], byte[])}, plus optional {@code preferredLocations} hostnames that + * drive Spark's data-locality scheduling. + *
    5. {@link #createProvider(byte[], byte[])} — executor-side, builds the bridge's {@code + * Arc<dyn TableProvider>} for this specific partition, wraps it in an {@code + * FFI_TableProvider}, returns the raw boxed pointer as a {@code jlong}. The caller owns + * this pointer and is responsible for handing it to exactly one consumer (the consumer's + * {@code Drop} releases it). *
    * *

    Implementations must be no-arg constructable so the Spark connector can instantiate them @@ -52,15 +56,29 @@ public interface FfiProviderFactory { byte[] encodeOptions(Map sparkOptions); /** - * Enumerate partition identifiers for this dataset. One Spark task is created per returned id. - * Driver-side only. + * Enumerate partitions for this dataset. One Spark task is created per returned {@link + * PartitionInfo}. Driver-side only. + * + *

    Each partition's {@code partitionBytes} ships verbatim through {@code + * DatafusionInputPartition} to the executor, where it is passed to {@link + * #createProvider(byte[], byte[])}. Use it to encode whatever slice metadata (row range, + * sub-options, file offsets, segment id, …) the bridge needs to materialise *that* partition. + * + *

    Each partition's {@code preferredLocations} hostnames are returned from {@code + * InputPartition.preferredLocations()} so Spark co-locates the task with the data; empty array + * = no preference. */ - String[] listPartitions(byte[] optionsProtoBytes); + PartitionInfo[] listPartitions(byte[] optionsProtoBytes); /** - * Build the underlying {@code Arc} and wrap it in an {@code - * FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; the - * caller takes ownership. + * Build the underlying {@code Arc} for one partition and wrap it in an + * {@code FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; + * the caller takes ownership. + * + * @param optionsProtoBytes global options produced by {@link #encodeOptions(Map)} + * @param partitionBytes per-partition slice payload from {@link PartitionInfo#partitionBytes()}. + * Empty array for single-partition tables and for the driver-side schema probe in {@code + * DatafusionSource.inferSchema}. */ - long createProvider(byte[] optionsProtoBytes); + long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes); } diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java new file mode 100644 index 0000000..a653eac --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +/** + * Driver-side descriptor for a single partition produced by {@link + * FfiProviderFactory#listPartitions(byte[])}. Carries the bridge-specific slice payload that the + * executor passes back into {@link FfiProviderFactory#createProvider(byte[], byte[])}, plus + * optional host hints for Spark's scheduler. + * + *

    Fields: + * + *

      + *
    • {@code id} — stable, human-readable identifier for this partition (e.g. a Rerun segment + * id). Surfaces in Spark UI, logs, and exception messages. Must be non-empty. + *
    • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the + * executor needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). + * Combined with the global {@code optionsProtoBytes} in {@link + * FfiProviderFactory#createProvider(byte[], byte[])}. Empty array = no per-partition state + * (single-partition table). + *
    • {@code preferredLocations} — hostnames where this partition's data lives. Returned from + * {@code InputPartition.preferredLocations()} so Spark can co-locate the task with the + * data. Empty array = no preference. Honoured subject to {@code spark.locality.wait}. + *
    + */ +public record PartitionInfo(String id, byte[] partitionBytes, String[] preferredLocations) { + + public PartitionInfo { + if (id == null || id.isEmpty()) { + throw new IllegalArgumentException("PartitionInfo: id must be non-empty"); + } + if (partitionBytes == null) { + partitionBytes = new byte[0]; + } + if (preferredLocations == null) { + preferredLocations = new String[0]; + } + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index 466651a..69be838 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -23,17 +23,18 @@ import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionRead /** * Spark `Batch` for a DataFusion-backed scan. Owns: - * - partition planning (driver-side: `factory.listPartitions` enumerates partition ids → one - * task per id) + * - partition planning (driver-side: `factory.listPartitions` enumerates `PartitionInfo` + * entries → one task per entry; each task receives that partition's `partitionBytes` and + * `preferredLocations`) * - per-task reader factory ([[DatafusionPartitionReaderFactory]]) */ class DatafusionBatch(val scan: DatafusionScan) extends Batch { override def planInputPartitions(): Array[InputPartition] = { val factory = instantiateFactory(scan.factoryFqcn) - val partitionIds: Array[String] = factory.listPartitions(scan.optionsProtoBytes) + val partitions: Array[PartitionInfo] = factory.listPartitions(scan.optionsProtoBytes) - if (partitionIds == null || partitionIds.isEmpty) { + if (partitions == null || partitions.isEmpty) { throw new IllegalStateException( s"FfiProviderFactory '${scan.factoryFqcn}' returned no partitions to scan" ) @@ -42,13 +43,15 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { val projection = scan.prunedSchema.fieldNames val filterBytes: Array[Array[Byte]] = 
scan.pushedPredicateBytes - partitionIds.iterator.map { id => + partitions.iterator.map { p => DatafusionInputPartition( factoryFqcn = scan.factoryFqcn, optionsProtoBytes = scan.optionsProtoBytes, projectionColumnNames = projection, filterProtoBytes = filterBytes, - partitionId = id + partitionId = p.id, + partitionBytes = p.partitionBytes, + preferredLocs = p.preferredLocations ).asInstanceOf[InputPartition] }.toArray } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index e777f99..5bf5a35 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -31,8 +31,9 @@ import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} * Per-task columnar reader. Lifecycle: * * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg). - * 2. `createProvider(optionsProtoBytes)` — bridge builds an `Arc`, wraps it - * in an `FFI_TableProvider`, returns the raw pointer. + * 2. `createProvider(optionsProtoBytes, partitionBytes)` — bridge builds an `Arc` materialising the slice described by `partitionBytes`, wraps it in an + * `FFI_TableProvider`, returns the raw pointer. * 3. Hand that pointer to connector-core's widening cdylib via `FfiHelperNative.wrapWithWidening`. * The cdylib wraps the inner provider in a `WideningTableProvider` (kernel-level * `arrow::compute::cast` for Spark-incompatible Arrow types) and re-FFIs it. 
@@ -55,7 +56,7 @@ class DatafusionColumnarPartitionReader( private val factory: FfiProviderFactory = instantiateFactory(partition.factoryFqcn) private val df: DataFrame = { - val rawPtr = factory.createProvider(partition.optionsProtoBytes) + val rawPtr = factory.createProvider(partition.optionsProtoBytes, partition.partitionBytes) val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) ctx.registerFfiTable(TableName, widenedPtr) var d = ctx.sql(buildSql()) diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index 9d66d2b..c3c54c1 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -25,18 +25,28 @@ import org.apache.spark.sql.connector.read.InputPartition * Per-task payload shipped from driver to executor via Java serialization. * * - `factoryFqcn`: fully-qualified class name of the bridge's `FfiProviderFactory`. The - * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes)`. - * - `optionsProtoBytes`: bridge-specific connection options, encoded by the bridge. Opaque to - * connector-core. + * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes, + * partitionBytes)`. + * - `optionsProtoBytes`: bridge-specific global connection options, encoded by the bridge. + * Opaque to connector-core. Same bytes ride along on every partition. * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is * applied via `DataFrame.filterFromProto`. - * - `partitionId`: stable identifier (e.g. Rerun segment id) — for `preferredLocations` / debug. + * - `partitionId`: stable identifier (e.g. Rerun segment id) — surfaces in Spark UI/logs/errors. 
+ * - `partitionBytes`: opaque per-partition payload from `PartitionInfo.partitionBytes`. Passed + * back into `createProvider` so the bridge materialises *this* slice. + * - `preferredLocs`: hostnames where this partition's data lives; returned from + * `preferredLocations()` so Spark schedules the task there subject to `spark.locality.wait`. */ final case class DatafusionInputPartition( factoryFqcn: String, optionsProtoBytes: Array[Byte], projectionColumnNames: Array[String], filterProtoBytes: Array[Array[Byte]], - partitionId: String -) extends InputPartition + partitionId: String, + partitionBytes: Array[Byte], + preferredLocs: Array[String] +) extends InputPartition { + + override def preferredLocations(): Array[String] = preferredLocs +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index 629060e..bcf0f99 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -67,7 +67,9 @@ class DatafusionSource extends TableProvider with DataSourceRegister { val arrowSchema = { val ctx = new SessionContext() try { - val rawPtr = factory.createProvider(optionsBytes) + // Schema probe: pass empty partitionBytes — bridges are required to honour an empty + // payload for the driver-side probe (schema must not depend on per-partition state). 
+ val rawPtr = factory.createProvider(optionsBytes, Array.emptyByteArray) val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) ctx.registerFfiTable("__df_schema_probe__", widenedPtr) ctx.tableSchema("__df_schema_probe__") From f7d3972a8d93ff1cc931cb55e7e7c3a6477ad9c3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 18:29:44 +0200 Subject: [PATCH 08/22] docs(examples): update SPARK_INTEGRATION for PartitionInfo + per-slice createProvider Reflect the FfiProviderFactory signature change: `listPartitions` returns `PartitionInfo[]` carrying the per-partition payload + preferred hosts, and `createProvider` now takes `(opts, partitionBytes)`. Explain the slice/locality semantics and refresh the "what runs where" table. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/SPARK_INTEGRATION.md | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md index 690a91a..0e01d74 100644 --- a/examples/SPARK_INTEGRATION.md +++ b/examples/SPARK_INTEGRATION.md @@ -75,8 +75,16 @@ Box::into_raw(Box::new(ffi)) as jlong ``` Driver-side partition enumeration goes through a second JNI entrypoint -`listPartitions(options_proto_bytes) -> String[]`. One Spark task gets created -per returned id. +`listPartitions(options_proto_bytes) -> PartitionInfo[]`. One Spark task gets +created per returned entry. Each `PartitionInfo` carries: + +- `id` — stable, human-readable partition identifier (surfaces in Spark UI/logs). +- `partitionBytes` — opaque per-partition payload, replayed into + `createProvider(opts, partitionBytes)` so the executor materialises *this* + slice. Empty array = no per-partition state. +- `preferredLocations` — hostnames where this slice's data lives. Spark uses + these (subject to `spark.locality.wait`) to co-locate the task with the + data — e.g. four partitions per worker on a 3-worker cluster. 
## JVM glue @@ -95,13 +103,22 @@ public final class MyBridgeProviderFactory implements FfiProviderFactory { } @Override - public String[] listPartitions(byte[] optionsProtoBytes) { - return MyBridgeNative.listPartitions(optionsProtoBytes); + public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { + // Bridge enumerates slices and resolves their host placement: + // record MySlice(String id, byte[] payload, String[] hosts) {} + MySlice[] slices = MyBridgeNative.listSlices(optionsProtoBytes); + PartitionInfo[] out = new PartitionInfo[slices.length]; + for (int i = 0; i < slices.length; i++) { + out[i] = new PartitionInfo(slices[i].id(), slices[i].payload(), slices[i].hosts()); + } + return out; } @Override - public long createProvider(byte[] optionsProtoBytes) { - return MyBridgeNative.createFfiProvider(optionsProtoBytes); + public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { + // partitionBytes is the same payload returned from listPartitions for *this* task. + // The driver-side schema probe passes an empty array; honour it. 
+ return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); } } ``` @@ -150,10 +167,10 @@ df = (spark.read.format("my_format") | Phase | Where | Path | | --------------------------- | --------- | ---- | -| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider` → widen → `registerFfiTable` → `ctx.tableSchema` | -| `planInputPartitions` | Driver | `factory.listPartitions(optionsBytes)` → one task per id | +| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → widen → `registerFfiTable` → `ctx.tableSchema` | +| `planInputPartitions` | Driver | `factory.listPartitions(optionsBytes)` → one task per `PartitionInfo`; each task gets that entry's `partitionBytes` + `preferredLocations` | | Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | -| Per-task scan | Executor | Same factory → widen → `registerFfiTable` → `ctx.sql("SELECT proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | +| Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → widen → `registerFfiTable` → `ctx.sql("SELECT proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | ## Caveats From daa3ba541a79e1915b5e4b82e2b11c6e7672baae Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 10 Jun 2026 20:25:16 +0200 Subject: [PATCH 09/22] feat(spark): SupportsReportPartitioning via optional reportPartitioning hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `reportPartitioning(opts)` default method on `FfiProviderFactory` that returns a `ReportedPartitioning` (Transform[] keys, derived numPartitions) or null for unknown. 
`DatafusionScan` now implements `SupportsReportPartitioning` and emits `KeyGroupedPartitioning(keys, partitions.length)` when the bridge opts in, `UnknownPartitioning` otherwise — letting Spark's optimizer skip shuffles ahead of compatible joins/aggregations. Move the `listPartitions` + `reportPartitioning` call sites into `DatafusionScanBuilder.build()` so the Scan has both facts cached up front; `DatafusionBatch.planInputPartitions` now reuses the cached `PartitionInfo[]` instead of re-invoking the factory. `ReportedPartitioning` ships with `identity(cols…)` and `bucket(n, cols…)` convenience builders for the common cases. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/SPARK_INTEGRATION.md | 14 ++- .../datafusion/spark/FfiProviderFactory.java | 19 ++++ .../spark/ReportedPartitioning.java | 87 +++++++++++++++++++ .../io/datafusion/spark/DatafusionBatch.scala | 22 +---- .../io/datafusion/spark/DatafusionScan.scala | 27 +++++- .../spark/DatafusionScanBuilder.scala | 33 ++++++- 6 files changed, 177 insertions(+), 25 deletions(-) create mode 100644 spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md index 0e01d74..d78b70c 100644 --- a/examples/SPARK_INTEGRATION.md +++ b/examples/SPARK_INTEGRATION.md @@ -120,6 +120,16 @@ public final class MyBridgeProviderFactory implements FfiProviderFactory { // The driver-side schema probe passes an empty array; honour it. return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); } + + @Override + public ReportedPartitioning reportPartitioning(byte[] optionsProtoBytes) { + // Optional. Return non-null only when each PartitionInfo's rows all share the same + // key tuple under the declared transforms — Spark elides shuffles ahead of joins/aggs + // grouped on those keys. Return null when the layout is unknown or row-key mapping + // would be lossy. 
+ return ReportedPartitioning.identity("device_id"); + // or: ReportedPartitioning.bucket(numBuckets, "user_id"); + } } ``` @@ -168,7 +178,9 @@ df = (spark.read.format("my_format") | Phase | Where | Path | | --------------------------- | --------- | ---- | | `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → widen → `registerFfiTable` → `ctx.tableSchema` | -| `planInputPartitions` | Driver | `factory.listPartitions(optionsBytes)` → one task per `PartitionInfo`; each task gets that entry's `partitionBytes` + `preferredLocations` | +| `ScanBuilder.build` | Driver | `factory.listPartitions(optionsBytes)` (cached on Scan) + `factory.reportPartitioning(optionsBytes)` (cached on Scan) | +| `outputPartitioning` | Driver | `KeyGroupedPartitioning(reported.keys, partitions.length)` when bridge declared one; `UnknownPartitioning(partitions.length)` otherwise. Spark may elide shuffles when keys line up with downstream join/agg grouping. | +| `planInputPartitions` | Driver | Reuses the cached `PartitionInfo[]`; one task per entry with that entry's `partitionBytes` + `preferredLocations` | | Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | | Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → widen → `registerFfiTable` → `ctx.sql("SELECT proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 3443f3a..5a9a262 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -81,4 +81,23 @@ public interface FfiProviderFactory { * DatafusionSource.inferSchema}. 
*/ long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes); + + /** + * Declare how rows are partitioned across the {@link PartitionInfo} entries returned by {@link + * #listPartitions(byte[])}. Driver-side only. + * + *

<p>When non-null, the connector surfaces a {@code KeyGroupedPartitioning(keys, + * listPartitions(...).length)} to Spark via {@code SupportsReportPartitioning} so the optimizer + * can elide shuffles ahead of joins/aggregations on the declared keys. + * + *

<p>Default returns {@code null}: the bridge makes no partitioning guarantees, so Spark plans as if + * the scan's output ordering and grouping are unknown. + * + *

    If a bridge implements this, it must hold the {@link ReportedPartitioning} contract: every + * row in a given partition evaluates to the same tuple of key values under the declared + * transforms. + */ + default ReportedPartitioning reportPartitioning(byte[] optionsProtoBytes) { + return null; + } } diff --git a/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java new file mode 100644 index 0000000..06de668 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +import java.util.Arrays; + +import org.apache.spark.sql.connector.expressions.Expressions; +import org.apache.spark.sql.connector.expressions.Transform; + +/** + * Driver-side declaration of how a bridge's data is partitioned on the key columns. 
When supplied + * via {@link FfiProviderFactory#reportPartitioning(byte[])}, the connector surfaces a {@link + * org.apache.spark.sql.connector.read.partitioning.KeyGroupedPartitioning} from {@link + * org.apache.spark.sql.connector.read.SupportsReportPartitioning#outputPartitioning()} — Spark's + * optimizer can then skip the shuffle ahead of joins/aggregations whose grouping keys line up + * with these transforms. + * + *

<p>Contract: for any partition reported by {@link FfiProviderFactory#listPartitions(byte[])}, + * every row produced by that partition must evaluate to the same tuple of key values under these + * transforms. Different partitions may share key values (Spark will fuse them), but a single + * partition must not straddle more than one key value. + * + *

    The partition count Spark sees is {@code listPartitions(...).length}; it is not carried here + * to keep a single source of truth. + */ +public final class ReportedPartitioning { + + private final Transform[] keys; + + public ReportedPartitioning(Transform[] keys) { + if (keys == null || keys.length == 0) { + throw new IllegalArgumentException( + "ReportedPartitioning: keys must contain at least one transform"); + } + this.keys = keys; + } + + public Transform[] keys() { + return keys; + } + + /** + * Convenience: declare identity partitioning on one or more columns (a row in partition P has + * the same {@code (col1, col2, …)} values as every other row in P). + */ + public static ReportedPartitioning identity(String... columns) { + if (columns == null || columns.length == 0) { + throw new IllegalArgumentException( + "ReportedPartitioning.identity: at least one column required"); + } + Transform[] ts = Arrays.stream(columns).map(Expressions::identity).toArray(Transform[]::new); + return new ReportedPartitioning(ts); + } + + /** + * Convenience: declare hash-bucket partitioning. Mirrors Spark's {@code bucket(N, cols…)} + * transform — each row is assigned to bucket {@code hash(cols) mod numBuckets}. + */ + public static ReportedPartitioning bucket(int numBuckets, String... 
columns) { + if (numBuckets <= 0) { + throw new IllegalArgumentException( + "ReportedPartitioning.bucket: numBuckets must be > 0, got " + numBuckets); + } + if (columns == null || columns.length == 0) { + throw new IllegalArgumentException( + "ReportedPartitioning.bucket: at least one column required"); + } + return new ReportedPartitioning(new Transform[] {Expressions.bucket(numBuckets, columns)}); + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index 69be838..0464854 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -23,27 +23,18 @@ import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionRead /** * Spark `Batch` for a DataFusion-backed scan. Owns: - * - partition planning (driver-side: `factory.listPartitions` enumerates `PartitionInfo` - * entries → one task per entry; each task receives that partition's `partitionBytes` and - * `preferredLocations`) + * - partition planning (driver-side: reuses the `PartitionInfo[]` already resolved by + * [[DatafusionScanBuilder]] — one task per entry; each task receives that entry's + * `partitionBytes` + `preferredLocations`) * - per-task reader factory ([[DatafusionPartitionReaderFactory]]) */ class DatafusionBatch(val scan: DatafusionScan) extends Batch { override def planInputPartitions(): Array[InputPartition] = { - val factory = instantiateFactory(scan.factoryFqcn) - val partitions: Array[PartitionInfo] = factory.listPartitions(scan.optionsProtoBytes) - - if (partitions == null || partitions.isEmpty) { - throw new IllegalStateException( - s"FfiProviderFactory '${scan.factoryFqcn}' returned no partitions to scan" - ) - } - val projection = scan.prunedSchema.fieldNames val filterBytes: Array[Array[Byte]] = scan.pushedPredicateBytes - partitions.iterator.map { p => + scan.partitions.iterator.map { p => 
DatafusionInputPartition( factoryFqcn = scan.factoryFqcn, optionsProtoBytes = scan.optionsProtoBytes, @@ -58,9 +49,4 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { override def createReaderFactory(): PartitionReaderFactory = new DatafusionPartitionReaderFactory(scan.prunedSchema) - - private def instantiateFactory(fqcn: String): FfiProviderFactory = { - val cls = Class.forName(fqcn) - cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] - } } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index 90f3cad..755fa9f 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -20,13 +20,24 @@ package io.datafusion.spark import org.apache.spark.sql.connector.expressions.filter.Predicate -import org.apache.spark.sql.connector.read.{Batch, Scan} +import org.apache.spark.sql.connector.read.{Batch, Scan, SupportsReportPartitioning} +import org.apache.spark.sql.connector.read.partitioning.{ + KeyGroupedPartitioning, + Partitioning, + UnknownPartitioning +} import org.apache.spark.sql.types.StructType /** * Read plan for a DataFusion-backed scan. Holds pruning state, the pushed predicates (for * `description()` / `explain(True)`), and the corresponding `LogicalExprNode` proto byte arrays * the executor applies via `DataFrame.filterFromProto`. + * + * Also carries the driver-resolved `PartitionInfo[]` (so [[DatafusionBatch]] doesn't re-call + * `listPartitions`) and the optional bridge-declared [[ReportedPartitioning]]; when present, the + * scan surfaces a `KeyGroupedPartitioning` via `SupportsReportPartitioning` so Spark's optimizer + * can skip shuffles ahead of compatible joins/aggregations. When absent, an + * `UnknownPartitioning(partitions.length)` is reported (still correct, just no shuffle elision). 
*/ class DatafusionScan( val factoryFqcn: String, @@ -34,14 +45,22 @@ class DatafusionScan( val fullSchema: StructType, val prunedSchema: StructType, val pushedPredicates: Array[Predicate], - val pushedPredicateBytes: Array[Array[Byte]] -) extends Scan { + val pushedPredicateBytes: Array[Array[Byte]], + val partitions: Array[PartitionInfo], + val reportedPartitioning: ReportedPartitioning +) extends Scan + with SupportsReportPartitioning { override def readSchema(): StructType = prunedSchema override def description(): String = s"DatafusionScan(factory=$factoryFqcn, projection=${prunedSchema.fieldNames.mkString(",")}," + - s" pushedPredicates=${pushedPredicates.length})" + s" pushedPredicates=${pushedPredicates.length}, partitions=${partitions.length}," + + s" reportedPartitioning=${if (reportedPartitioning == null) "unknown" else "key-grouped"})" override def toBatch: Batch = new DatafusionBatch(this) + + override def outputPartitioning(): Partitioning = + if (reportedPartitioning == null) new UnknownPartitioning(partitions.length) + else new KeyGroupedPartitioning(reportedPartitioning.keys().toArray, partitions.length) } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala index 63ef8ce..f1fe354 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -30,6 +30,12 @@ import org.apache.spark.sql.types.StructType * Pushdown discipline: over-claiming Exact = wrong results, under-claiming = full scans. The * translator (see [[SparkPredicateTranslator]]) only emits proto for predicates it can encode * losslessly — anything else returns `None` and lands in residuals. 
+ * + * `build()` is also where we resolve driver-side facts that the optimizer needs *before* it + * starts asking the [[DatafusionScan]] about its output partitioning: the partition list + * (`listPartitions`) and the bridge's optional [[ReportedPartitioning]]. Resolving both here once + * and threading them onto the Scan keeps `DatafusionBatch.planInputPartitions` free of factory calls and + * lets `outputPartitioning()` answer without an extra factory call per query. */ class DatafusionScanBuilder( factoryFqcn: String, @@ -71,6 +77,29 @@ class DatafusionScanBuilder( pruned = requiredSchema } - override def build(): Scan = - new DatafusionScan(factoryFqcn, optionsProtoBytes, fullSchema, pruned, pushed, pushedBytes) + override def build(): Scan = { + val factory = instantiateFactory(factoryFqcn) + val partitions: Array[PartitionInfo] = factory.listPartitions(optionsProtoBytes) + if (partitions == null || partitions.isEmpty) { + throw new IllegalStateException( + s"FfiProviderFactory '$factoryFqcn' returned no partitions to scan" + ) + } + val reported: ReportedPartitioning = factory.reportPartitioning(optionsProtoBytes) + new DatafusionScan( + factoryFqcn, + optionsProtoBytes, + fullSchema, + pruned, + pushed, + pushedBytes, + partitions, + reported + ) + } + + private def instantiateFactory(fqcn: String): FfiProviderFactory = { + val cls = Class.forName(fqcn) + cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + } } From c926d3d7eaafd3999444f7380aea6443403e5788 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 09:00:39 +0200 Subject: [PATCH 10/22] feat(spark): shared-scan mode with per-executor provider cache One provider per task makes createProvider cost dominate scans with thousands of small partitions.
Shared-scan mode (opt-in via FfiProviderFactory.sharedScan) builds the provider once per (executor JVM x query), plans it once, and maps one Spark task onto each DataFusion output partition via the new DataFrame.toPartitionedExecution / PartitionedExecution core API. - SharedScanCache: scanId-keyed, refcounted, idle-TTL evicted; exactly-once build per task wave; failures not cached - PinnedSessionConfig: driver-resolved session knobs shipped to executors (default target_partitions is core-count-dependent and would desync partition indices across machines); executors fail fast on partition-count divergence - listPartitions(opts, filters) overload: bridges can prune whole partitions from pushed predicates - PartitionInfo.partitionKeyValues + HasPartitionKey wiring: the reported KeyGroupedPartitioning was inert on Spark 3.3+ without per-partition key values - legacy reader: close intermediate DataFrames in the filter fold Determinism contract (snapshot pinning, re-executable execute(i)) documented on FfiProviderFactory.sharedScan and in SPARK_INTEGRATION.md; equal partition counts with different contents are undetectable by construction. BREAKING CHANGE: PartitionInfo's canonical record constructor gains a fourth component (partitionKeyValues). Source-compatible via the 3-arg delegating constructor; binary compatibility with pre-existing compiled bridges is broken. 
--- .../java/org/apache/datafusion/DataFrame.java | 19 ++ .../datafusion/PartitionedExecution.java | 114 ++++++++++ .../datafusion/PartitionedExecutionTest.java | 196 +++++++++++++++++ examples/SPARK_INTEGRATION.md | 83 +++++++- examples/native/src/lib.rs | 118 +++++++++-- examples/python/ffi_table_provider_demo.py | 51 +++++ .../examples/ExampleFfiProviderFactory.java | 61 +++++- native/src/lib.rs | 7 +- native/src/partitioned_execution.rs | 169 +++++++++++++++ spark/native/src/widening.rs | 59 ++++-- .../datafusion/spark/FfiProviderFactory.java | 78 ++++++- .../io/datafusion/spark/PartitionInfo.java | 34 ++- .../spark/ReportedPartitioning.java | 12 +- .../spark/ArrowColumnarBatchIteration.scala | 58 ++++++ .../io/datafusion/spark/DatafusionBatch.scala | 107 ++++++++-- .../DatafusionColumnarPartitionReader.scala | 67 ++---- .../spark/DatafusionInputPartition.scala | 69 +++++- .../DatafusionPartitionReaderFactory.scala | 16 +- .../io/datafusion/spark/DatafusionScan.scala | 68 ++++-- .../spark/DatafusionScanBuilder.scala | 74 +++++-- .../spark/DatafusionSqlBuilder.scala | 41 ++++ .../spark/NativeSharedScanResources.scala | 112 ++++++++++ .../spark/PinnedSessionConfig.scala | 91 ++++++++ .../io/datafusion/spark/SharedScanCache.scala | 197 ++++++++++++++++++ .../spark/SharedScanPartitionReader.scala | 82 ++++++++ .../FfiProviderFactoryDefaultsTest.scala | 77 +++++++ .../spark/PartitionKeyConversionTest.scala | 76 +++++++ .../spark/SharedScanCacheTest.scala | 195 +++++++++++++++++ 28 files changed, 2160 insertions(+), 171 deletions(-) create mode 100644 core/src/main/java/org/apache/datafusion/PartitionedExecution.java create mode 100644 core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java create mode 100644 native/src/partitioned_execution.rs create mode 100644 spark/src/main/scala/io/datafusion/spark/ArrowColumnarBatchIteration.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala create mode 100644 
spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala create mode 100644 spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala create mode 100644 spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala create mode 100644 spark/src/test/scala/io/datafusion/spark/PartitionKeyConversionTest.scala create mode 100644 spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala diff --git a/core/src/main/java/org/apache/datafusion/DataFrame.java b/core/src/main/java/org/apache/datafusion/DataFrame.java index c9d1183..39834cc 100644 --- a/core/src/main/java/org/apache/datafusion/DataFrame.java +++ b/core/src/main/java/org/apache/datafusion/DataFrame.java @@ -113,6 +113,23 @@ public ArrowReader executeStream(BufferAllocator allocator) { } } + /** + * Plan this DataFrame once and return a {@link PartitionedExecution} that can stream each + * physical-plan output partition independently (and concurrently from multiple threads). + * + *

    Consumes this DataFrame with the same lifecycle rules as {@link + * #executeStream(BufferAllocator)}: the logical plan is released into the planned execution, and + * the caller owns (and must close) the returned handle. + */ + public PartitionedExecution toPartitionedExecution() { + if (nativeHandle == 0) { + throw new IllegalStateException("DataFrame is closed or already collected"); + } + long handle = nativeHandle; + nativeHandle = 0; + return new PartitionedExecution(createPartitionedExecution(handle)); + } + /** * Return the Arrow {@link Schema} of this DataFrame's output. Non-consuming: the receiver remains * usable and must still be closed independently. Schema inspection does not execute the plan. @@ -806,6 +823,8 @@ public void close() { private static native void executeStreamDataFrame(long handle, long ffiStreamAddr); + private static native long createPartitionedExecution(long handle); + private static native void closeDataFrame(long handle); private static native long countRows(long handle); diff --git a/core/src/main/java/org/apache/datafusion/PartitionedExecution.java b/core/src/main/java/org/apache/datafusion/PartitionedExecution.java new file mode 100644 index 0000000..8b9ce08 --- /dev/null +++ b/core/src/main/java/org/apache/datafusion/PartitionedExecution.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion; + +import org.apache.arrow.c.ArrowArrayStream; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; + +/** + * A DataFrame planned exactly once, exposing its physical plan's output partitions for individual + * streaming. Obtained via {@link DataFrame#toPartitionedExecution()}. + * + *

<p>Unlike {@link DataFrame#executeStream(BufferAllocator)}, which coalesces every output + * partition into one stream, this handle lets distinct threads drive distinct partitions — e.g. one + * Spark task per DataFusion partition. + * + *

    Thread safety. {@link #partitionCount()} and {@link #executeStream(int, + * BufferAllocator)} are safe to call concurrently from multiple threads on the same instance. + * Re-executing the same partition index more than once opens an independent native stream each + * time, but only succeeds when every operator in that partition's pipeline supports repeated {@code + * execute()} — stateless scans (MemTable, table providers) do; {@code RepartitionExec} pipelines + * (hash aggregates, joins) do not and fail the second stream. {@link #close()} is idempotent, but + * the caller must guarantee that no {@code executeStream} call is in flight and that all returned + * readers have been closed before calling it — the native plan is freed immediately. Consumers that + * share one instance across threads must enforce that ordering themselves (e.g. with a reference + * count). + */ +public final class PartitionedExecution implements AutoCloseable { + static { + NativeLibraryLoader.loadLibrary(); + } + + private volatile long nativeHandle; + + PartitionedExecution(long nativeHandle) { + if (nativeHandle == 0) { + throw new IllegalStateException("Failed to create native PartitionedExecution"); + } + this.nativeHandle = nativeHandle; + } + + /** Number of output partitions of the planned physical plan. */ + public int partitionCount() { + long handle = nativeHandle; + if (handle == 0) { + throw new IllegalStateException("PartitionedExecution is closed"); + } + return partitionCountNative(handle); + } + + /** + * Open an independent stream over one plan partition. Each call to {@link + * ArrowReader#loadNextBatch} drives one async {@code stream.next()} on the native side, so memory + * pressure stays bounded by the executor pipeline plus one in-flight batch. + * + *

    Non-consuming: this instance remains usable, and concurrent calls — including for the same + * partition index — are safe. The caller closes the returned reader; the supplied allocator must + * outlive it. + * + * @param partition partition index in {@code [0, partitionCount())} + * @throws RuntimeException if the index is out of range for the planned partitioning + */ + public ArrowReader executeStream(int partition, BufferAllocator allocator) { + long handle = nativeHandle; + if (handle == 0) { + throw new IllegalStateException("PartitionedExecution is closed"); + } + ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator); + try { + executeStreamPartition(handle, partition, stream.memoryAddress()); + return Data.importArrayStream(allocator, stream); + } catch (Throwable e) { + stream.close(); + throw e; + } + } + + /** + * Release the native plan. Idempotent. See the class Javadoc for the ordering contract with + * in-flight {@link #executeStream(int, BufferAllocator)} calls. + */ + @Override + public void close() { + long handle = nativeHandle; + if (handle != 0) { + nativeHandle = 0; + closePartitionedExecution(handle); + } + } + + private static native int partitionCountNative(long handle); + + private static native void executeStreamPartition(long handle, int partition, long ffiStreamAddr); + + private static native void closePartitionedExecution(long handle); +} diff --git a/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java b/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java new file mode 100644 index 0000000..74d320a --- /dev/null +++ b/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.junit.jupiter.api.Test; + +class PartitionedExecutionTest { + + /** + * A plan whose physical form reliably keeps {@code targetPartitions} output partitions: the + * hash-repartitioned GROUP BY can't be collapsed by the physical optimizer, unlike a bare + * top-level round-robin repartition, which {@code EnforceDistribution} removes as non-beneficial. + */ + private static final String GROUPED_SQL = + "SELECT x FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) AS t(x) GROUP BY x"; + + private static final List EXPECTED_ROWS = List.of(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L); + + /** Drain one partition's reader into a list of long values from column 0. 
*/ + private static List drain(ArrowReader reader) throws Exception { + List out = new ArrayList<>(); + while (reader.loadNextBatch()) { + BigIntVector v = (BigIntVector) reader.getVectorSchemaRoot().getVector(0); + for (int i = 0; i < v.getValueCount(); i++) { + out.add(v.get(i)); + } + } + return out; + } + + private static List drainPartition( + PartitionedExecution exec, int partition, BufferAllocator allocator) throws Exception { + try (ArrowReader reader = exec.executeStream(partition, allocator)) { + return drain(reader); + } + } + + @Test + void partitionCountMatchesTargetPartitions() throws Exception { + try (SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); + DataFrame df = ctx.sql(GROUPED_SQL); + PartitionedExecution exec = df.toPartitionedExecution()) { + assertEquals(4, exec.partitionCount()); + } + } + + @Test + void unionOfPartitionsEqualsFullResult() throws Exception { + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); + DataFrame df = ctx.sql(GROUPED_SQL); + PartitionedExecution exec = df.toPartitionedExecution()) { + assertEquals(4, exec.partitionCount()); + List all = new ArrayList<>(); + for (int p = 0; p < exec.partitionCount(); p++) { + all.addAll(drainPartition(exec, p, allocator)); + } + all.sort(Long::compare); + assertEquals(EXPECTED_ROWS, all); + } + } + + @Test + void concurrentPartitionStreamsAreIndependent() throws Exception { + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); + DataFrame df = ctx.sql(GROUPED_SQL); + PartitionedExecution exec = df.toPartitionedExecution()) { + int n = exec.partitionCount(); + assertEquals(4, n); + ExecutorService pool = Executors.newFixedThreadPool(n); + try { + List>> jobs = new ArrayList<>(); + for (int p = 0; p < n; p++) { + final int partition = p; + jobs.add(() -> drainPartition(exec, partition, allocator)); + } 
+ List all = new ArrayList<>(); + for (Future> f : pool.invokeAll(jobs)) { + all.addAll(f.get()); + } + all.sort(Long::compare); + assertEquals(EXPECTED_ROWS, all); + } finally { + pool.shutdownNow(); + } + } + } + + @Test + void samePartitionCanBeStreamedTwiceOnStatelessScans() throws Exception { + // Spark task retry / speculative execution re-executes a partition index. + // Re-execution is only supported by plans whose partitions are stateless + // scans (MemoryExec, table providers): a UNION ALL of two VALUES keeps one + // re-executable MemoryExec partition per branch. Pipelines containing + // RepartitionExec (e.g. a hash GROUP BY) panic on second execute -- its + // per-partition channels are single-use -- which is why this test does not + // reuse GROUPED_SQL. + String unionSql = + "SELECT * FROM (VALUES (1), (2)) AS t(x) UNION ALL SELECT * FROM (VALUES (3), (4)) AS t(x)"; + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = new SessionContext(); + DataFrame df = ctx.sql(unionSql); + PartitionedExecution exec = df.toPartitionedExecution()) { + assertEquals(2, exec.partitionCount()); + List firstTotal = new ArrayList<>(); + List secondTotal = new ArrayList<>(); + for (int p = 0; p < exec.partitionCount(); p++) { + firstTotal.addAll(drainPartition(exec, p, allocator)); + secondTotal.addAll(drainPartition(exec, p, allocator)); + } + firstTotal.sort(Long::compare); + secondTotal.sort(Long::compare); + assertEquals(List.of(1L, 2L, 3L, 4L), firstTotal); + assertEquals(firstTotal, secondTotal); + } + } + + @Test + void toPartitionedExecutionConsumesTheDataFrame() throws Exception { + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = new SessionContext()) { + DataFrame df = ctx.sql("SELECT 1"); + try (PartitionedExecution exec = df.toPartitionedExecution()) { + assertTrue(exec.partitionCount() >= 1); + } + assertThrows(IllegalStateException.class, () -> df.executeStream(allocator)); + 
assertThrows(IllegalStateException.class, df::toPartitionedExecution); + // close() on a consumed DataFrame stays a no-op (no double-free). + df.close(); + } + } + + @Test + void closeIsIdempotentAndBlocksFurtherUse() throws Exception { + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = new SessionContext(); + DataFrame df = ctx.sql("SELECT 1")) { + PartitionedExecution exec = df.toPartitionedExecution(); + exec.close(); + exec.close(); + assertThrows(IllegalStateException.class, exec::partitionCount); + assertThrows(IllegalStateException.class, () -> exec.executeStream(0, allocator)); + } + } + + @Test + void outOfRangePartitionThrowsClearError() throws Exception { + try (BufferAllocator allocator = new RootAllocator(); + SessionContext ctx = SessionContext.builder().targetPartitions(2).build(); + DataFrame df = ctx.sql(GROUPED_SQL); + PartitionedExecution exec = df.toPartitionedExecution()) { + assertEquals(2, exec.partitionCount()); + RuntimeException e = + assertThrows(RuntimeException.class, () -> exec.executeStream(7, allocator)); + assertTrue( + e.getMessage().contains("out of range"), + "expected out-of-range message, got: " + e.getMessage()); + assertThrows(RuntimeException.class, () -> exec.executeStream(-1, allocator)); + // The handle survives a failed executeStream call. 
+ assertEquals(2, exec.partitionCount()); + } + } +} diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md index d78b70c..e290b79 100644 --- a/examples/SPARK_INTEGRATION.md +++ b/examples/SPARK_INTEGRATION.md @@ -178,12 +178,93 @@ df = (spark.read.format("my_format") | Phase | Where | Path | | --------------------------- | --------- | ---- | | `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → widen → `registerFfiTable` → `ctx.tableSchema` | -| `ScanBuilder.build` | Driver | `factory.listPartitions(optionsBytes)` (cached on Scan) + `factory.reportPartitioning(optionsBytes)` (cached on Scan) | +| `ScanBuilder.build` | Driver | `factory.listPartitions(optionsBytes, filterBytes)` (filter-aware overload — bridges can prune partitions; cached on Scan) + `factory.reportPartitioning(optionsBytes)` (cached on Scan) | | `outputPartitioning` | Driver | `KeyGroupedPartitioning(reported.keys, partitions.length)` when bridge declared one; `UnknownPartitioning(partitions.length)` otherwise. Spark may elide shuffles when keys line up with downstream join/agg grouping. | | `planInputPartitions` | Driver | Reuses the cached `PartitionInfo[]`; one task per entry with that entry's `partitionBytes` + `preferredLocations` | | Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | | Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → widen → `registerFfiTable` → `ctx.sql("SELECT proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | +## Partition key values (`HasPartitionKey`) + +Declaring `reportPartitioning` alone is NOT enough on Spark 3.3+: Spark's +`DataSourceV2ScanExecBase.groupPartitions` only consumes the declared +`KeyGroupedPartitioning` when every input partition also implements +`HasPartitionKey`. 
To activate it, return the key values per partition via +`PartitionInfo`'s 4-argument constructor: + +```java +new PartitionInfo(slice.id(), slice.payload(), slice.hosts(), + new Object[] {slice.segmentId()}); // matches identity("segment_id") +``` + +Rules: all partitions carry keys or none (mixed state fails the scan +driver-side); array arity must equal the declared key count; values must be +`CatalystTypeConverters`-convertible Java types (`String`, `Long`, +`java.time.Instant`, `java.time.LocalDate`, `java.math.BigDecimal`, ...). +Storage-partitioned joins additionally require +`spark.sql.sources.v2.bucketing.enabled=true`. + +## Shared-scan mode + +The default model above builds one provider per Spark task. For datasets with +thousands of small partitions — or providers whose construction is expensive +(remote metadata, connection setup) — the per-task fixed cost dominates. +Shared-scan mode flips the mapping: the bridge's provider is built ONCE per +(executor JVM × query) with empty `partitionBytes`, planned once, and Spark +runs one task per *DataFusion-native* output partition; task `i` streams plan +partition `i` from the cached plan. 
+ +Opt in per dataset from the factory: + +```java +@Override +public boolean sharedScan(byte[] optionsProtoBytes) { + return MyBridgeOptions.fromProtoBytes(optionsProtoBytes).useSharedScan(); +} +``` + +What changes: + +| Phase | Where | Path | +| ---------------------- | -------- | ---- | +| `ScanBuilder.build` | Driver | mint `scanId` (UUID) + pin session config → probe build (same code path as executors) → physical plan partition count `N` → `N` tasks | +| `outputPartitioning` | Driver | always `UnknownPartitioning(N)` — DataFusion partitions carry no key contract; `listPartitions` / `reportPartitioning` are not called | +| Per-task scan | Executor | `SharedScanCache.acquire(scanId)` → (first task only) `createProvider(opts, EMPTY)` → widen → `registerFfiTable` on a pinned-config `SessionContext` → SQL + filters → plan once → every task `executeStream(partitionIndex)` → release | + +Cache semantics: entries are keyed by `scanId` (per query — separate actions +build separate entries), refcounted by open readers, and evicted after an idle +TTL. Build failures are not cached; eviction between task waves just rebuilds. + +Spark conf (all read driver-side at planning time and shipped to executors): + +- `spark.datafusion.sharedScan.targetPartitions` (default 8) — pinned + DataFusion `target_partitions`. Any constant works; it must merely be the + same everywhere, which shipping guarantees. +- `spark.datafusion.sharedScan.batchSize` (default 8192) +- `spark.datafusion.sharedScan.idleTtlMs` (default 120000) — cache idle + eviction window. + +**Determinism contract** (the price of admission — see +`FfiProviderFactory.sharedScan` Javadoc): the provider's schema, partitioning, +and per-partition contents must be a pure function of `optionsProtoBytes`. +Remote sources must pin a snapshot (version/timestamp) inside the options. 
+The connector fails tasks when an executor's partition count diverges from the +driver's, but equal counts with different contents are undetectable. The +provider's `ExecutionPlan` must also tolerate `execute(i)` being called more +than once per plan instance (task retry / speculative execution). + +Choosing a model: + +- **Per-partition payload (default)** — slices have host affinity + (`preferredLocations`), per-slice provider construction is cheap, or you + want `KeyGroupedPartitioning` + `HasPartitionKey` semantics. Bin-pack many + small slices into fewer `PartitionInfo` entries via `partitionBytes` (it is + opaque — encode a list of slice ids) before reaching for shared-scan. +- **Shared-scan** — thousands of small partitions, expensive + `createProvider`, no locality story, scan+filter+projection workloads. + Provider builds drop from one-per-task to one-per-executor (plus one driver + probe per query). + ## Caveats - One `FFI_LogicalExtensionCodec` per provider — v1 uses diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index 12f4805..e75a37d 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -31,11 +31,16 @@ //! //! ```text //! [u32 name_prefix_len][name_prefix UTF-8 bytes][u32 num_rows][u32 num_batches] +//! [u32 num_partitions][u8 shared_scan] <- optional trailing fields //! ``` //! //! Empty/`null` bytes decode as all defaults: `name_prefix="row"`, `num_rows=4`, -//! `num_batches=1`. Real bridges use a real proto schema here; this example -//! hand-rolls the encoding to keep the wire layer obvious. +//! `num_batches=1`, `num_partitions=1`, `shared_scan=false`. The trailing +//! fields are optional so blobs from older encoders keep decoding. The +//! `shared_scan` flag is consumed JVM-side (`ExampleFfiProviderFactory.sharedScan`); +//! this decoder ignores it; the byte is in the blob only so one format serves both sides. Real +//! bridges use a real proto schema here; this example hand-rolls the encoding +//!
to keep the wire layer obvious. use std::sync::Arc; @@ -75,6 +80,7 @@ struct Options { name_prefix: String, num_rows: u32, num_batches: u32, + num_partitions: u32, } impl Default for Options { @@ -83,13 +89,12 @@ impl Default for Options { name_prefix: "row".to_string(), num_rows: 4, num_batches: 1, + num_partitions: 1, } } } -fn decode_options( - bytes: &[u8], -) -> Result> { +fn decode_options(bytes: &[u8]) -> Result> { if bytes.is_empty() { return Ok(Options::default()); } @@ -105,15 +110,25 @@ fn decode_options( .map_err(|e| format!("name_prefix is not valid UTF-8: {e}"))? .to_string(); let num_rows = u32::from_le_bytes(bytes[name_end..name_end + 4].try_into().unwrap()); - let num_batches = - u32::from_le_bytes(bytes[name_end + 4..name_end + 8].try_into().unwrap()); + let num_batches = u32::from_le_bytes(bytes[name_end + 4..name_end + 8].try_into().unwrap()); if num_rows == 0 || num_batches == 0 { return Err("num_rows and num_batches must both be > 0".into()); } + // Optional trailing fields (older encoders omit them): num_partitions, + // then the shared_scan flag byte, which only the JVM side interprets. 
+ let num_partitions = if bytes.len() >= name_end + 12 { + u32::from_le_bytes(bytes[name_end + 8..name_end + 12].try_into().unwrap()) + } else { + 1 + }; + if num_partitions == 0 { + return Err("num_partitions must be > 0".into()); + } Ok(Options { name_prefix, num_rows, num_batches, + num_partitions, }) } @@ -139,7 +154,11 @@ fn build_mem_table( let id = (b as i64) * (opts.num_rows as i64) + (r as i64); ids.push(id); names.push(Some(format!("{}{}", opts.name_prefix, id))); - values.push(if id % 4 == 3 { None } else { Some(id as f64 * 1.5) }); + values.push(if id % 4 == 3 { + None + } else { + Some(id as f64 * 1.5) + }); } let batch = RecordBatch::try_new( Arc::clone(&schema), @@ -152,10 +171,16 @@ fn build_mem_table( batches.push(batch); } - // Wrap all batches inside a single MemTable partition so the example stays - // single-partition end-to-end; configuring DataFusion-level partitions - // would need separate plumbing in the Spark connector to surface them. - Ok(Arc::new(MemTable::try_new(schema, vec![batches])?)) + // Distribute the batches round-robin across `num_partitions` MemTable + // partitions. With num_partitions=1 the example stays single-partition; + // larger values give the Spark connector's shared-scan mode real + // DataFusion-native partitions to map tasks onto. Partitions beyond the + // batch count stay empty — DataFusion handles empty partitions fine. + let mut partitions: Vec> = vec![Vec::new(); opts.num_partitions as usize]; + for (i, batch) in batches.into_iter().enumerate() { + partitions[i % opts.num_partitions as usize].push(batch); + } + Ok(Arc::new(MemTable::try_new(schema, partitions)?)) } /// JNI entry point: decode the options blob, build a `MemTable` accordingly, @@ -164,7 +189,9 @@ fn build_mem_table( /// `Box::from_raw` is performed by `SessionContext.registerFfiTable` on the /// consumer side. 
#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider<'local>( +pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider< + 'local, +>( mut env: JNIEnv<'local>, _class: JClass<'local>, options_bytes: JByteArray<'local>, @@ -208,7 +235,9 @@ pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExamp /// pointer to `registerFfiTable` must NOT also call this; ownership has /// already transferred. #[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_dropProvider<'local>( +pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_dropProvider< + 'local, +>( _env: JNIEnv<'local>, _class: JClass<'local>, ffi_ptr: jlong, @@ -224,34 +253,81 @@ pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExamp mod tests { use super::*; + fn encode(prefix: &str, num_rows: u32, num_batches: u32) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&(prefix.len() as u32).to_le_bytes()); + buf.extend_from_slice(prefix.as_bytes()); + buf.extend_from_slice(&num_rows.to_le_bytes()); + buf.extend_from_slice(&num_batches.to_le_bytes()); + buf + } + #[test] fn empty_bytes_decodes_to_defaults() { let o = decode_options(&[]).unwrap(); assert_eq!(o.name_prefix, "row"); assert_eq!(o.num_rows, 4); assert_eq!(o.num_batches, 1); + assert_eq!(o.num_partitions, 1); } #[test] fn roundtrip_decodes_options() { - let prefix = "user"; - let mut buf = Vec::new(); - buf.extend_from_slice(&(prefix.len() as u32).to_le_bytes()); - buf.extend_from_slice(prefix.as_bytes()); - buf.extend_from_slice(&5u32.to_le_bytes()); - buf.extend_from_slice(&3u32.to_le_bytes()); - let o = decode_options(&buf).unwrap(); + let o = decode_options(&encode("user", 5, 3)).unwrap(); assert_eq!(o.name_prefix, "user"); assert_eq!(o.num_rows, 5); assert_eq!(o.num_batches, 3); } + 
#[test] + fn old_blob_without_trailing_fields_defaults_partitions_to_one() { + let o = decode_options(&encode("user", 5, 3)).unwrap(); + assert_eq!(o.num_partitions, 1); + } + + #[test] + fn trailing_fields_decode_num_partitions_and_ignore_flag_byte() { + let mut buf = encode("user", 5, 8); + buf.extend_from_slice(&4u32.to_le_bytes()); + buf.push(1); // shared_scan flag: JVM-side only + let o = decode_options(&buf).unwrap(); + assert_eq!(o.num_partitions, 4); + } + + #[test] + fn zero_partitions_rejected() { + let mut buf = encode("user", 5, 8); + buf.extend_from_slice(&0u32.to_le_bytes()); + buf.push(0); + assert!(decode_options(&buf).is_err()); + } + + #[test] + fn batches_distribute_round_robin_across_partitions() { + let opts = Options { + name_prefix: "u".to_string(), + num_rows: 2, + num_batches: 5, + num_partitions: 3, + }; + let table = build_mem_table(&opts).unwrap(); + // MemTable has no partition accessor; verify via scan output partitioning. + use datafusion::catalog::TableProvider; + let ctx = SessionContext::new(); + let rt = Runtime::new().unwrap(); + let plan = rt + .block_on(async { table.scan(&ctx.state(), None, &[], None).await }) + .unwrap(); + assert_eq!(plan.properties().output_partitioning().partition_count(), 3); + } + #[test] fn build_table_has_expected_schema() { let opts = Options { name_prefix: "user".to_string(), num_rows: 5, num_batches: 3, + num_partitions: 1, }; let table = build_mem_table(&opts).unwrap(); let schema = table.schema(); diff --git a/examples/python/ffi_table_provider_demo.py b/examples/python/ffi_table_provider_demo.py index 00389ca..45510bb 100644 --- a/examples/python/ffi_table_provider_demo.py +++ b/examples/python/ffi_table_provider_demo.py @@ -182,6 +182,57 @@ def main() -> None: print("=== projection: id, name ===") df.select("id", "name").show(n=total_rows, truncate=False) + legacy_rows = {tuple(r) for r in df.collect()} + + # --- shared-scan mode ------------------------------------------------- + # 
`shared_scan=true` flips ExampleFfiProviderFactory.sharedScan: one + # provider + plan cached per executor, one Spark task per MemTable + # partition (num_partitions=4), each task streaming one DataFusion plan + # partition. Results must be identical to the legacy run above. + num_partitions = 4 + shared = ( + spark.read.format("datafusion") + .option( + "df.factory", + "org.apache.datafusion.examples.ExampleFfiProviderFactory", + ) + .option("name_prefix", name_prefix) + .option("num_rows", str(num_rows)) + .option("num_batches", str(num_batches)) + .option("num_partitions", str(num_partitions)) + .option("shared_scan", "true") + .load() + ) + + print(f"=== shared-scan mode: num_partitions={num_partitions} ===") + shared_partitions = shared.rdd.getNumPartitions() + print(f"=== shared-scan Spark partitions: {shared_partitions} ===") + assert shared_partitions == num_partitions, ( + f"expected {num_partitions} Spark partitions in shared-scan mode, " + f"got {shared_partitions}" + ) + + shared.show(n=total_rows, truncate=False) + shared_rows = {tuple(r) for r in shared.collect()} + assert shared_rows == legacy_rows, ( + "shared-scan rows diverge from legacy mode: " + f"only-legacy={legacy_rows - shared_rows} only-shared={shared_rows - legacy_rows}" + ) + print(f"=== shared-scan returned the same {len(shared_rows)} rows as legacy mode ===") + + print("=== shared-scan filter pushdown: value > 5.0 ===") + shared.filter("value > 5.0").show(n=total_rows, truncate=False) + + # Note on cache scope: the executor cache is keyed by a per-query scanId, + # so sharing happens across the TASKS of one query (4 tasks above -> one + # provider build per executor JVM, observable via the factory's + # createProvider stdout line), not across separate actions. Each new + # action plans a new scan with a fresh scanId; its entry simply joins the + # cache until the idle TTL evicts it. 
+ count_again = shared.count() + assert count_again == total_rows, f"expected {total_rows} rows, got {count_again}" + print("=== shared-scan count() as a separate action also succeeded ===") + spark.stop() diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index f2890c5..b1b90c2 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -50,6 +50,12 @@ *

  • {@code num_rows} — rows per batch. Default {@code 4}. *
  • {@code num_batches} — number of in-memory {@code RecordBatch}es composing the table. * Default {@code 1}. + *
  • {@code num_partitions} — number of DataFusion-native MemTable partitions the batches are + * distributed across (round-robin). Default {@code 1}. Mostly interesting together with + * {@code shared_scan}. + *
  • {@code shared_scan} — {@code true} opts into the connector's shared-scan mode: one cached + * provider + plan per executor, one Spark task per MemTable partition. Default {@code false} + * (single task via {@link #listPartitions(byte[])}). * * *

    Real bridges (Rerun, HDF5, custom Iceberg) use a protobuf schema for {@code @@ -58,14 +64,16 @@ * *

      *   [u32 LE name_prefix_len][name_prefix UTF-8 bytes][u32 LE num_rows][u32 LE num_batches]
    + *       [u32 LE num_partitions][u8 shared_scan]
      * 
    * - *

    An empty {@code byte[]} is also accepted by the native side and decoded as all defaults. + *

    An empty {@code byte[]} is also accepted by the native side and decoded as all defaults; the + * two trailing fields are optional so older blobs keep decoding. * - *

    A single partition (id {@code "p0"}, empty {@code partitionBytes}, no preferred host) is - * reported so Spark spawns one task; the executor calls {@link #createProvider(byte[], byte[])} - * to obtain a fresh {@code FFI_TableProvider} pointer, hands it to {@link - * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the + *

    In the default mode a single partition (id {@code "p0"}, empty {@code partitionBytes}, no + * preferred host) is reported so Spark spawns one task; the executor calls {@link + * #createProvider(byte[], byte[])} to obtain a fresh {@code FFI_TableProvider} pointer, hands it to + * {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the * resulting Arrow record batches back into the Spark scan. */ public final class ExampleFfiProviderFactory implements FfiProviderFactory { @@ -73,10 +81,13 @@ public final class ExampleFfiProviderFactory implements FfiProviderFactory { static final String OPT_NAME_PREFIX = "name_prefix"; static final String OPT_NUM_ROWS = "num_rows"; static final String OPT_NUM_BATCHES = "num_batches"; + static final String OPT_NUM_PARTITIONS = "num_partitions"; + static final String OPT_SHARED_SCAN = "shared_scan"; static final String DEFAULT_NAME_PREFIX = "row"; static final int DEFAULT_NUM_ROWS = 4; static final int DEFAULT_NUM_BATCHES = 1; + static final int DEFAULT_NUM_PARTITIONS = 1; public ExampleFfiProviderFactory() {} @@ -85,14 +96,18 @@ public byte[] encodeOptions(Map sparkOptions) { String namePrefix = sparkOptions.getOrDefault(OPT_NAME_PREFIX, DEFAULT_NAME_PREFIX); int numRows = parsePositiveInt(sparkOptions, OPT_NUM_ROWS, DEFAULT_NUM_ROWS); int numBatches = parsePositiveInt(sparkOptions, OPT_NUM_BATCHES, DEFAULT_NUM_BATCHES); + int numPartitions = parsePositiveInt(sparkOptions, OPT_NUM_PARTITIONS, DEFAULT_NUM_PARTITIONS); + boolean sharedScan = Boolean.parseBoolean(sparkOptions.getOrDefault(OPT_SHARED_SCAN, "false")); byte[] nameBytes = namePrefix.getBytes(StandardCharsets.UTF_8); ByteBuffer buf = - ByteBuffer.allocate(4 + nameBytes.length + 4 + 4).order(ByteOrder.LITTLE_ENDIAN); + ByteBuffer.allocate(4 + nameBytes.length + 4 + 4 + 4 + 1).order(ByteOrder.LITTLE_ENDIAN); buf.putInt(nameBytes.length); buf.put(nameBytes); buf.putInt(numRows); buf.putInt(numBatches); + buf.putInt(numPartitions); + 
buf.put((byte) (sharedScan ? 1 : 0)); return buf.array(); } @@ -103,9 +118,43 @@ public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])}; } + @Override + public PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filterProtoBytes) { + // The example cannot prune its single partition, but a real bridge would inspect the + // pushed predicates here and drop partitions that cannot match. + System.out.println( + "ExampleFfiProviderFactory.listPartitions received " + + filterProtoBytes.length + + " pushed filter(s)"); + return listPartitions(optionsProtoBytes); + } + + @Override + public boolean sharedScan(byte[] optionsProtoBytes) { + // The flag is the final byte of the options blob (present only when the encoder wrote the + // trailing fields). The bridge owns its wire format, so decoding it here is fair game. + return optionsProtoBytes != null + && optionsProtoBytes.length >= 1 + && hasTrailingFields(optionsProtoBytes) + && optionsProtoBytes[optionsProtoBytes.length - 1] == 1; + } + + private static boolean hasTrailingFields(byte[] bytes) { + if (bytes.length < 4) { + return false; + } + int nameLen = ByteBuffer.wrap(bytes, 0, 4).order(ByteOrder.LITTLE_ENDIAN).getInt(); + // base layout: 4 (len) + name + 4 (num_rows) + 4 (num_batches); trailing adds 4 + 1. + return bytes.length >= 4 + nameLen + 8 + 5; + } + @Override public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { // The example bridge has no per-partition state; `partitionBytes` is ignored. + // The print makes provider-build amortization observable in the demo: shared-scan + // mode builds once per (executor x query) regardless of task count, while the + // per-partition path builds once per task. 
+ System.out.println("ExampleFfiProviderFactory.createProvider building a MemTable provider"); return FfiTableProviderExampleNative.createMemTableProvider(optionsProtoBytes); } diff --git a/native/src/lib.rs b/native/src/lib.rs index 1777f19..bab6477 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -25,6 +25,7 @@ mod jni_util; mod json; mod memory; mod object_store; +mod partitioned_execution; mod proto; mod runtime_metrics; mod schema; @@ -295,9 +296,9 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_collectDataFrame<'lo /// the Java `ArrowReader`) consumes. Each call to `next()` drives one /// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the /// executor pipeline plus a single in-flight batch. -struct StreamingReader { - schema: SchemaRef, - stream: SendableRecordBatchStream, +pub(crate) struct StreamingReader { + pub(crate) schema: SchemaRef, + pub(crate) stream: SendableRecordBatchStream, } impl Iterator for StreamingReader { diff --git a/native/src/partitioned_execution.rs b/native/src/partitioned_execution.rs new file mode 100644 index 0000000..8ac3909 --- /dev/null +++ b/native/src/partitioned_execution.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Per-partition execution of a planned DataFrame. +//! +//! `Java_org_apache_datafusion_DataFrame_createPartitionedExecution` plans a +//! DataFrame exactly once and returns a handle over the resulting physical +//! plan. The handle supports concurrent `executeStreamPartition` calls from +//! multiple JVM threads -- `ExecutionPlan` and `TaskContext` are `Send + Sync` +//! and every call only clones their `Arc`s before producing an independent +//! `SendableRecordBatchStream`. Re-executing the same partition index twice +//! (Spark task retry / speculative execution) opens its own stream, but only +//! succeeds when every operator in that partition's pipeline supports repeated +//! `execute()` -- stateless scans (MemTable, table providers) do, while +//! `RepartitionExec` pipelines panic on the second call because their +//! per-partition channel receivers are single-use. +//! +//! The single unsafe interleaving is `closePartitionedExecution` racing an +//! in-flight call on the same handle. The Java consumer (the Spark connector's +//! shared-scan cache) prevents it with a refcount that covers every open +//! reader; `PartitionedExecution`'s Javadoc states the contract for any other +//! caller. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream; +use datafusion::dataframe::DataFrame; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::ExecutionPlan; +use jni::objects::JClass; +use jni::sys::{jint, jlong}; +use jni::JNIEnv; + +use crate::errors::{try_unwrap_or_throw, JniResult}; +use crate::{runtime, StreamingReader}; + +pub(crate) struct PartitionedExecutionState { + plan: Arc, + task_ctx: Arc, +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_DataFrame_createPartitionedExecution<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) -> jlong { + try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult { + if handle == 0 { + return Err("DataFrame handle is null".into()); + } + // Consuming, like executeStreamDataFrame: the Java side zeroes its + // handle before calling, so this Box is the last owner. + let df = unsafe { *Box::from_raw(handle as *mut DataFrame) }; + + // task_ctx() borrows; capture it before create_physical_plan consumes + // the DataFrame. 
+ let task_ctx = Arc::new(df.task_ctx()); + let plan = runtime().block_on(df.create_physical_plan())?; + + let state = PartitionedExecutionState { plan, task_ctx }; + Ok(Box::into_raw(Box::new(state)) as jlong) + }) +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_partitionCountNative< + 'local, +>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) -> jint { + try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult { + if handle == 0 { + return Err("PartitionedExecution handle is null".into()); + } + let state = unsafe { &*(handle as *const PartitionedExecutionState) }; + Ok(state + .plan + .properties() + .output_partitioning() + .partition_count() as jint) + }) +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_executeStreamPartition< + 'local, +>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + partition: jint, + ffi_stream_addr: jlong, +) { + try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + if handle == 0 { + return Err("PartitionedExecution handle is null".into()); + } + if ffi_stream_addr == 0 { + return Err("ffi stream address is null".into()); + } + let state = unsafe { &*(handle as *const PartitionedExecutionState) }; + + let partition_count = state + .plan + .properties() + .output_partitioning() + .partition_count(); + if partition < 0 || partition as usize >= partition_count { + return Err(format!( + "partition index {partition} out of range: plan has {partition_count} partition(s)" + ) + .into()); + } + + let plan = Arc::clone(&state.plan); + let task_ctx = Arc::clone(&state.task_ctx); + let schema: SchemaRef = plan.schema(); + + // ExecutionPlan::execute is synchronous, but operators may + // tokio::spawn at execute() time (RepartitionExec et al.), which + // requires a runtime context to be entered. + let stream = { + let _guard = runtime().enter(); + plan.execute(partition as usize, task_ctx)? 
+ }; + + let reader = StreamingReader { schema, stream }; + let ffi = FFI_ArrowArrayStream::new(Box::new(reader)); + unsafe { + std::ptr::write(ffi_stream_addr as *mut FFI_ArrowArrayStream, ffi); + } + Ok(()) + }) +} + +#[no_mangle] +pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_closePartitionedExecution< + 'local, +>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) { + try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + if handle == 0 { + return Err("PartitionedExecution handle is null".into()); + } + drop(unsafe { Box::from_raw(handle as *mut PartitionedExecutionState) }); + Ok(()) + }) +} diff --git a/spark/native/src/widening.rs b/spark/native/src/widening.rs index 92874ad..8879507 100644 --- a/spark/native/src/widening.rs +++ b/spark/native/src/widening.rs @@ -37,11 +37,11 @@ use datafusion::catalog::{Session, TableProvider}; use datafusion::common::{DataFusionError, Result}; use datafusion::execution::TaskContext; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableType}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, }; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use datafusion::physical_expr::EquivalenceProperties; use futures::stream::StreamExt; /// Compute the cast-target DataType for an Arrow type not directly readable @@ -85,7 +85,11 @@ fn widened_schema(inner: &ArrowSchema) -> (SchemaRef, Vec>) { for f in inner.fields() { match arrow_cast_widening(f.data_type()) { Some(target) => { - fields.push(Arc::new(Field::new(f.name(), target.clone(), f.is_nullable()))); + fields.push(Arc::new(Field::new( + f.name(), + target.clone(), + f.is_nullable(), + ))); targets.push(Some(target)); } None => { @@ -110,7 +114,11 @@ pub struct WideningTableProvider { impl 
WideningTableProvider { pub fn new(inner: Arc) -> Self { let (widened, targets) = widened_schema(&inner.schema()); - Self { inner, widened, targets } + Self { + inner, + widened, + targets, + } } } @@ -145,8 +153,10 @@ impl TableProvider for WideningTableProvider { let inner_plan = self.inner.scan(session, projection, filters, limit).await?; let (projected_widened, projected_targets) = match projection { Some(idxs) => { - let fields: Vec> = - idxs.iter().map(|i| Arc::clone(&self.widened.fields()[*i])).collect(); + let fields: Vec> = idxs + .iter() + .map(|i| Arc::clone(&self.widened.fields()[*i])) + .collect(); let targets: Vec> = idxs.iter().map(|i| self.targets[*i].clone()).collect(); (Arc::new(ArrowSchema::new(fields)) as SchemaRef, targets) @@ -185,7 +195,12 @@ impl WideningExec { inner_props.emission_type, inner_props.boundedness, )); - Self { inner, schema, targets, properties } + Self { + inner, + schema, + targets, + properties, + } } } @@ -250,7 +265,10 @@ impl ExecutionPlan for WideningExec { Err(e) => Err(e), Ok(batch) => cast_batch(&batch, &schema, &targets), }); - Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), mapped))) + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema.clone(), + mapped, + ))) } } @@ -283,14 +301,26 @@ mod tests { #[test] fn unsigned_ints_widen_to_signed_wider() { assert_eq!(arrow_cast_widening(&DataType::UInt8), Some(DataType::Int16)); - assert_eq!(arrow_cast_widening(&DataType::UInt16), Some(DataType::Int32)); - assert_eq!(arrow_cast_widening(&DataType::UInt32), Some(DataType::Int64)); - assert_eq!(arrow_cast_widening(&DataType::UInt64), Some(DataType::Int64)); + assert_eq!( + arrow_cast_widening(&DataType::UInt16), + Some(DataType::Int32) + ); + assert_eq!( + arrow_cast_widening(&DataType::UInt32), + Some(DataType::Int64) + ); + assert_eq!( + arrow_cast_widening(&DataType::UInt64), + Some(DataType::Int64) + ); } #[test] fn float16_widens_to_float32() { - assert_eq!(arrow_cast_widening(&DataType::Float16), 
Some(DataType::Float32)); + assert_eq!( + arrow_cast_widening(&DataType::Float16), + Some(DataType::Float32) + ); } #[test] @@ -310,7 +340,10 @@ mod tests { let ns = DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("UTC"))); assert_eq!( arrow_cast_widening(&ns), - Some(DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC")))) + Some(DataType::Timestamp( + TimeUnit::Microsecond, + Some(Arc::from("UTC")) + )) ); let us_no_tz = DataType::Timestamp(TimeUnit::Microsecond, None); assert_eq!(arrow_cast_widening(&us_no_tz), None); diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 5a9a262..506cd66 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -38,9 +38,9 @@ * drive Spark's data-locality scheduling. *

  • {@link #createProvider(byte[], byte[])} — executor-side, builds the bridge's {@code * Arc<dyn TableProvider>} for this specific partition, wraps it in an {@code - * FFI_TableProvider}, returns the raw boxed pointer as a {@code jlong}. The caller owns - * this pointer and is responsible for handing it to exactly one consumer (the consumer's - * {@code Drop} releases it). + * FFI_TableProvider}, returns the raw boxed pointer as a {@code jlong}. The caller owns this + * pointer and is responsible for handing it to exactly one consumer (the consumer's {@code + * Drop} releases it). * * *

    Implementations must be no-arg constructable so the Spark connector can instantiate them @@ -60,20 +60,70 @@ public interface FfiProviderFactory { * PartitionInfo}. Driver-side only. * *

    Each partition's {@code partitionBytes} ships verbatim through {@code - * DatafusionInputPartition} to the executor, where it is passed to {@link - * #createProvider(byte[], byte[])}. Use it to encode whatever slice metadata (row range, - * sub-options, file offsets, segment id, …) the bridge needs to materialise *that* partition. + * DatafusionInputPartition} to the executor, where it is passed to {@link #createProvider(byte[], + * byte[])}. Use it to encode whatever slice metadata (row range, sub-options, file offsets, + * segment id, …) the bridge needs to materialise *that* partition. * *

    Each partition's {@code preferredLocations} hostnames are returned from {@code - * InputPartition.preferredLocations()} so Spark co-locates the task with the data; empty array - * = no preference. + * InputPartition.preferredLocations()} so Spark co-locates the task with the data; empty array = + * no preference. */ PartitionInfo[] listPartitions(byte[] optionsProtoBytes); /** - * Build the underlying {@code Arc} for one partition and wrap it in an - * {@code FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; - * the caller takes ownership. + * Filter-aware variant of {@link #listPartitions(byte[])}. The connector calls this overload with + * the pushed-down predicates ({@code LogicalExprNode} proto bytes, one array per predicate, same + * encoding the executor later replays via {@code DataFrame.filterFromProto}). Bridges that can + * map predicates onto their partition layout (e.g. {@code segment_id = 'x'}) should prune + * partitions that cannot match — pruning here eliminates whole Spark tasks, whereas the per-task + * filter only reduces rows inside a task. + * + *

    Pruning must be conservative: only drop a partition when NO row in it can satisfy the + * conjunction of all pushed predicates. The default delegates to the filter-unaware overload (no + * pruning), which is always correct. + */ + default PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filterProtoBytes) { + return listPartitions(optionsProtoBytes); + } + + /** + * Opt into shared-scan mode for this dataset. Default {@code false} (per-partition payload mode, + * the {@link #listPartitions(byte[])} path). + * + *

    When {@code true}, the connector builds ONE provider per (executor JVM × scan) with empty + * {@code partitionBytes}, plans it once, and runs one Spark task per DataFusion output partition + * — task {@code i} streams plan partition {@code i} from the shared, cached plan. This amortises + * {@code createProvider} cost across all tasks on an executor and is the right model when the + * dataset has many small partitions or provider construction is expensive (remote metadata, + * connections). {@link #listPartitions(byte[])} and {@link #reportPartitioning(byte[])} are NOT + * called in this mode, and the scan reports {@code UnknownPartitioning} (DataFusion-native + * partitions carry no key contract). + * + *

    Determinism contract. The driver counts partitions by planning once; every executor + * re-plans independently and must arrive at the same result. A bridge returning {@code true} + * guarantees: + * + *

      + *
    • The provider's schema, partitioning, and per-partition row content are a pure function of + * {@code optionsProtoBytes}. Remote sources must pin a snapshot (version, timestamp) inside + * the options; data that compacts or moves between driver planning and executor execution + * otherwise yields wrong results that no runtime check can catch. + *
    • The provider's {@code ExecutionPlan} supports calling {@code execute(i)} more than once + * per plan instance (Spark task retry and speculative execution re-execute a partition + * index, sometimes concurrently). Stateless scans satisfy this; single-shot streams do not. + *
    + * + *

    The connector fails tasks with a clear error when the executor's partition count diverges + * from the driver's — but identical counts with different contents cannot be detected. + */ + default boolean sharedScan(byte[] optionsProtoBytes) { + return false; + } + + /** + * Build the underlying {@code Arc} for one partition and wrap it in an {@code + * FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; the + * caller takes ownership. * * @param optionsProtoBytes global options produced by {@link #encodeOptions(Map)} * @param partitionBytes per-partition slice payload from {@link PartitionInfo#partitionBytes()}. @@ -96,6 +146,12 @@ public interface FfiProviderFactory { *

    If a bridge implements this, it must hold the {@link ReportedPartitioning} contract: every * row in a given partition evaluates to the same tuple of key values under the declared * transforms. + * + *

    Spark 3.3+ caveat: the reported partitioning only takes effect when every {@link + * PartitionInfo} also carries {@link PartitionInfo#partitionKeyValues()} (surfaced to Spark via + * {@code HasPartitionKey}); without key values Spark ignores the declared {@code + * KeyGroupedPartitioning} entirely. Storage-partitioned joins additionally require {@code + * spark.sql.sources.v2.bucketing.enabled=true}. */ default ReportedPartitioning reportPartitioning(byte[] optionsProtoBytes) { return null; diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java index a653eac..bdd990d 100644 --- a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -30,17 +30,28 @@ *

      *
    • {@code id} — stable, human-readable identifier for this partition (e.g. a Rerun segment * id). Surfaces in Spark UI, logs, and exception messages. Must be non-empty. - *
    • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the - * executor needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). - * Combined with the global {@code optionsProtoBytes} in {@link - * FfiProviderFactory#createProvider(byte[], byte[])}. Empty array = no per-partition state - * (single-partition table). + *
    • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the executor + * needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). Combined with + * the global {@code optionsProtoBytes} in {@link FfiProviderFactory#createProvider(byte[], + * byte[])}. Empty array = no per-partition state (single-partition table). *
    • {@code preferredLocations} — hostnames where this partition's data lives. Returned from - * {@code InputPartition.preferredLocations()} so Spark can co-locate the task with the - * data. Empty array = no preference. Honoured subject to {@code spark.locality.wait}. + * {@code InputPartition.preferredLocations()} so Spark can co-locate the task with the data. + * Empty array = no preference. Honoured subject to {@code spark.locality.wait}. + *
    • {@code partitionKeyValues} — optional values of the partitioning keys for every row in this + * partition, in the same order as {@link FfiProviderFactory#reportPartitioning(byte[])}'s + * declared transforms. {@code null} = no key (the default). When the bridge reports a + * partitioning AND every partition carries key values, the connector exposes them to Spark + * via {@code HasPartitionKey} — required on Spark 3.3+ for the reported {@code + * KeyGroupedPartitioning} to have any effect (and storage-partitioned joins additionally + * require {@code spark.sql.sources.v2.bucketing.enabled=true}). Values must be Java types + * that Spark's {@code CatalystTypeConverters} can convert for the key columns' data types + * (e.g. {@code String}, {@code Long}, {@code Integer}, {@code java.time.Instant}, {@code + * java.time.LocalDate}, {@code java.math.BigDecimal}), and the array length must equal the + * number of declared keys. *
    */ -public record PartitionInfo(String id, byte[] partitionBytes, String[] preferredLocations) { +public record PartitionInfo( + String id, byte[] partitionBytes, String[] preferredLocations, Object[] partitionKeyValues) { public PartitionInfo { if (id == null || id.isEmpty()) { @@ -52,5 +63,12 @@ public record PartitionInfo(String id, byte[] partitionBytes, String[] preferred if (preferredLocations == null) { preferredLocations = new String[0]; } + // partitionKeyValues stays null when absent: null and "no key" are the same state, + // and DatafusionBatch distinguishes keyed from unkeyed partitions by it. + } + + /** Without partition key values — the common case. */ + public PartitionInfo(String id, byte[] partitionBytes, String[] preferredLocations) { + this(id, partitionBytes, preferredLocations, null); } } diff --git a/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java index 06de668..01fbd1b 100644 --- a/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java +++ b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java @@ -29,13 +29,13 @@ * via {@link FfiProviderFactory#reportPartitioning(byte[])}, the connector surfaces a {@link * org.apache.spark.sql.connector.read.partitioning.KeyGroupedPartitioning} from {@link * org.apache.spark.sql.connector.read.SupportsReportPartitioning#outputPartitioning()} — Spark's - * optimizer can then skip the shuffle ahead of joins/aggregations whose grouping keys line up - * with these transforms. + * optimizer can then skip the shuffle ahead of joins/aggregations whose grouping keys line up with + * these transforms. * *

    Contract: for any partition reported by {@link FfiProviderFactory#listPartitions(byte[])}, * every row produced by that partition must evaluate to the same tuple of key values under these - * transforms. Different partitions may share key values (Spark will fuse them); they - * must not straddle key values. + * transforms. Different partitions may share key values (Spark will fuse them); they must + * not straddle key values. * *

    The partition count Spark sees is {@code listPartitions(...).length}; it is not carried here * to keep a single source of truth. @@ -57,8 +57,8 @@ public Transform[] keys() { } /** - * Convenience: declare identity partitioning on one or more columns (a row in partition P has - * the same {@code (col1, col2, …)} values as every other row in P). + * Convenience: declare identity partitioning on one or more columns (a row in partition P has the + * same {@code (col1, col2, …)} values as every other row in P). */ public static ReportedPartitioning identity(String... columns) { if (columns == null || columns.length == 0) { diff --git a/spark/src/main/scala/io/datafusion/spark/ArrowColumnarBatchIteration.scala b/spark/src/main/scala/io/datafusion/spark/ArrowColumnarBatchIteration.scala new file mode 100644 index 0000000..30f62f8 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/ArrowColumnarBatchIteration.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import org.apache.arrow.vector.FieldVector +import org.apache.arrow.vector.ipc.ArrowReader +import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} + +/** + * Shared `next()`/`get()` loop for the connector's columnar readers: each `loadNextBatch()` + * yields a `VectorSchemaRoot` wrapped as a `ColumnarBatch` of [[NonClosingArrowColumnVector]]s + * (the reader owns the vectors; Spark must not close them per batch). + */ +private[spark] trait ArrowColumnarBatchIteration { + + /** The Arrow stream this reader drains. Stable for the reader's lifetime. */ + protected def arrowReader: ArrowReader + + private var currentBatch: ColumnarBatch = _ + + def next(): Boolean = { + if (currentBatch != null) { + currentBatch = null + } + if (!arrowReader.loadNextBatch()) return false + val root = arrowReader.getVectorSchemaRoot + val vectors: java.util.List[FieldVector] = root.getFieldVectors + val cols = new Array[ColumnVector](vectors.size()) + var i = 0 + while (i < vectors.size()) { + cols(i) = new NonClosingArrowColumnVector(vectors.get(i)) + i += 1 + } + val batch = new ColumnarBatch(cols) + batch.setNumRows(root.getRowCount) + currentBatch = batch + true + } + + def get(): ColumnarBatch = currentBatch +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index 0464854..c2ec384 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -19,14 +19,17 @@ package io.datafusion.spark +import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory} /** - * Spark `Batch` for a DataFusion-backed scan. 
Owns: - * - partition planning (driver-side: reuses the `PartitionInfo[]` already resolved by - * [[DatafusionScanBuilder]] — one task per entry; each task receives that entry's - * `partitionBytes` + `preferredLocations`) - * - per-task reader factory ([[DatafusionPartitionReaderFactory]]) + * Spark `Batch` for a DataFusion-backed scan. Driver-side partition planning: + * - [[LegacyMode]]: one task per `PartitionInfo` (resolved by [[DatafusionScanBuilder]]); when + * the bridge reported a partitioning and every entry carries key values, tasks implement + * `HasPartitionKey` so Spark can actually use the `KeyGroupedPartitioning`. + * - [[SharedScanMode]]: one task per DataFusion plan partition index. */ class DatafusionBatch(val scan: DatafusionScan) extends Batch { @@ -34,19 +37,91 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { val projection = scan.prunedSchema.fieldNames val filterBytes: Array[Array[Byte]] = scan.pushedPredicateBytes - scan.partitions.iterator.map { p => - DatafusionInputPartition( - factoryFqcn = scan.factoryFqcn, - optionsProtoBytes = scan.optionsProtoBytes, - projectionColumnNames = projection, - filterProtoBytes = filterBytes, - partitionId = p.id, - partitionBytes = p.partitionBytes, - preferredLocs = p.preferredLocations - ).asInstanceOf[InputPartition] - }.toArray + scan.mode match { + case LegacyMode(partitions, reported) => + val keyed = DatafusionBatch.validateKeyedState(scan.factoryFqcn, partitions, reported) + partitions.iterator.map { p => + val base = DatafusionInputPartition( + factoryFqcn = scan.factoryFqcn, + optionsProtoBytes = scan.optionsProtoBytes, + projectionColumnNames = projection, + filterProtoBytes = filterBytes, + partitionId = p.id, + partitionBytes = p.partitionBytes, + preferredLocs = p.preferredLocations + ) + val out: DatafusionPartition = + if (keyed) { + DatafusionKeyedInputPartition( + base, + DatafusionBatch.toKeyRow(p.id, p.partitionKeyValues, reported)) + } else base + 
out.asInstanceOf[InputPartition] + }.toArray + + case SharedScanMode(scanId, numPartitions, pinnedConfig, idleTtlMs) => + Array.tabulate[InputPartition](numPartitions) { i => + DatafusionSharedScanPartition( + factoryFqcn = scan.factoryFqcn, + optionsProtoBytes = scan.optionsProtoBytes, + projectionColumnNames = projection, + filterProtoBytes = filterBytes, + scanId = scanId, + partitionIndex = i, + numPartitions = numPartitions, + pinnedConfig = pinnedConfig, + idleTtlMs = idleTtlMs + ) + } + } } override def createReaderFactory(): PartitionReaderFactory = new DatafusionPartitionReaderFactory(scan.prunedSchema) } + +private[spark] object DatafusionBatch { + + /** + * Keyed partitions require a reported partitioning AND key values on EVERY partition. A mixed + * state means the bridge violated its own contract; failing driver-side beats Spark silently + * planning without the declared grouping. + */ + def validateKeyedState( + factoryFqcn: String, + partitions: Array[PartitionInfo], + reported: ReportedPartitioning): Boolean = { + if (reported == null) { + return false + } + val withKeys = partitions.count(_.partitionKeyValues != null) + if (withKeys == 0) { + return false + } + if (withKeys != partitions.length) { + throw new IllegalStateException( + s"FfiProviderFactory '$factoryFqcn' reported a partitioning but only $withKeys of " + + s"${partitions.length} PartitionInfo entries carry partitionKeyValues; either all " + + "partitions must carry key values or none") + } + true + } + + /** + * Convert a bridge-supplied `Object[]` of key values into Spark's internal row representation + * (String → UTF8String, Instant → micros, LocalDate → days, BigDecimal → Decimal, ...). 
+ */ + def toKeyRow( + partitionId: String, + values: Array[AnyRef], + reported: ReportedPartitioning): InternalRow = { + val keyCount = reported.keys().length + if (values.length != keyCount) { + throw new IllegalStateException( + s"PartitionInfo '$partitionId' carries ${values.length} partitionKeyValues but the " + + s"reported partitioning declares $keyCount key(s)") + } + val converted = values.map(v => CatalystTypeConverters.convertToCatalyst(v)) + new GenericInternalRow(converted) + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index 5bf5a35..4a357d6 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -20,15 +20,14 @@ package io.datafusion.spark import org.apache.arrow.memory.RootAllocator -import org.apache.arrow.vector.FieldVector import org.apache.arrow.vector.ipc.ArrowReader import org.apache.datafusion.{DataFrame, SessionContext} import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch} +import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Per-task columnar reader. Lifecycle: + * Per-task columnar reader for the per-partition payload (legacy) path. Lifecycle: * * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg). * 2. `createProvider(optionsProtoBytes, partitionBytes)` — bridge builds an `Arc` DataFrame; apply pushed filters via - * `DataFrame.filterFromProto`. - * 6. `executeStream` returns an `ArrowReader`; each `loadNextBatch()` yields a - * `VectorSchemaRoot` we wrap as a `ColumnarBatch` of `NonClosingArrowColumnVector`s. + * `DataFrame.filterFromProto`, closing each intermediate frame. + * 6. 
`executeStream` returns an `ArrowReader`; batches surface through + * [[ArrowColumnarBatchIteration]]. */ class DatafusionColumnarPartitionReader( partition: DatafusionInputPartition, readSchema: StructType -) extends PartitionReader[ColumnarBatch] { - - private val TableName = "df_spark_partition" +) extends PartitionReader[ColumnarBatch] + with ArrowColumnarBatchIteration { private val allocator = new RootAllocator(Long.MaxValue) private val ctx: SessionContext = new SessionContext() private val factory: FfiProviderFactory = instantiateFactory(partition.factoryFqcn) - private val df: DataFrame = { + override protected val arrowReader: ArrowReader = { val rawPtr = factory.createProvider(partition.optionsProtoBytes, partition.partitionBytes) val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) - ctx.registerFfiTable(TableName, widenedPtr) - var d = ctx.sql(buildSql()) + ctx.registerFfiTable(DatafusionSqlBuilder.PartitionTableName, widenedPtr) + var df: DataFrame = ctx.sql( + DatafusionSqlBuilder + .buildSql(partition.projectionColumnNames, DatafusionSqlBuilder.PartitionTableName)) var i = 0 while (i < partition.filterProtoBytes.length) { - d = d.filterFromProto(partition.filterProtoBytes(i)) - i += 1 - } - d - } - private val reader: ArrowReader = df.executeStream(allocator) - - private var currentBatch: ColumnarBatch = _ - - override def next(): Boolean = { - if (currentBatch != null) { - currentBatch = null - } - if (!reader.loadNextBatch()) return false - val root = reader.getVectorSchemaRoot - val vectors: java.util.List[FieldVector] = root.getFieldVectors - val cols = new Array[ColumnVector](vectors.size()) - var i = 0 - while (i < vectors.size()) { - cols(i) = new NonClosingArrowColumnVector(vectors.get(i)) + val filtered = df.filterFromProto(partition.filterProtoBytes(i)) + df.close() + df = filtered i += 1 } - val batch = new ColumnarBatch(cols) - batch.setNumRows(root.getRowCount) - currentBatch = batch - true + df.executeStream(allocator) } - 
override def get(): ColumnarBatch = currentBatch - override def close(): Unit = { var first: Throwable = null def safe(f: => Unit): Unit = try f catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } - safe(reader.close()) + safe(arrowReader.close()) safe(ctx.close()) safe(allocator.close()) if (first != null) throw first } - private def buildSql(): String = { - val cols = - if (partition.projectionColumnNames.isEmpty) "*" - else - partition.projectionColumnNames - .map(c => "\"" + c.replace("\"", "\"\"") + "\"") - .mkString(", ") - s"""SELECT $cols FROM "$TableName"""" - } - private def instantiateFactory(fqcn: String): FfiProviderFactory = { val cls = Class.forName(fqcn) cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] } - } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index c3c54c1..03d2c2e 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -19,10 +19,17 @@ package io.datafusion.spark -import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition} /** - * Per-task payload shipped from driver to executor via Java serialization. + * Marker for the connector's task payloads, shipped driver → executor via Java serialization. + * [[DatafusionPartitionReaderFactory]] dispatches on the concrete type. + */ +sealed trait DatafusionPartition extends InputPartition + +/** + * Per-task payload for the per-partition payload (legacy) read path. * * - `factoryFqcn`: fully-qualified class name of the bridge's `FfiProviderFactory`. 
The * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes, @@ -46,7 +53,63 @@ final case class DatafusionInputPartition( partitionId: String, partitionBytes: Array[Byte], preferredLocs: Array[String] -) extends InputPartition { +) extends DatafusionPartition { override def preferredLocations(): Array[String] = preferredLocs } + +/** + * Legacy-path payload that additionally carries this partition's key values, precomputed + * driver-side into an [[InternalRow]]. Emitted by [[DatafusionBatch]] when the bridge reported a + * partitioning AND every `PartitionInfo` carries `partitionKeyValues` — implementing + * [[HasPartitionKey]] is what makes the reported `KeyGroupedPartitioning` visible to Spark 3.3+ + * (`DataSourceV2ScanExecBase.groupPartitions` ignores it otherwise). + */ +final case class DatafusionKeyedInputPartition( + base: DatafusionInputPartition, + keyRow: InternalRow +) extends DatafusionPartition + with HasPartitionKey { + + override def preferredLocations(): Array[String] = base.preferredLocations() + + override def partitionKey(): InternalRow = keyRow +} + +/** + * Per-task payload for shared-scan mode: task `partitionIndex` streams that DataFusion plan + * partition from the executor's cached entry (see [[SharedScanCache]]). + * + * - `scanId`: driver-minted UUID identifying this scan; the executor cache key. + * - `partitionIndex`: DataFusion output partition this task drives. + * - `numPartitions`: the driver probe's partition count; executors fail fast when their re-plan + * diverges (determinism guard). + * - `pinnedConfig`: DataFusion session knobs resolved once on the driver and replicated on + * every executor so both plan identically. + * - `idleTtlMs`: cache-entry idle eviction window, resolved from driver conf. + * + * No preferred locations: the shared plan materialises the whole dataset on whichever executors + * Spark picks; there is no per-slice host mapping in this mode. 
+ */ +final case class DatafusionSharedScanPartition( + factoryFqcn: String, + optionsProtoBytes: Array[Byte], + projectionColumnNames: Array[String], + filterProtoBytes: Array[Array[Byte]], + scanId: String, + partitionIndex: Int, + numPartitions: Int, + pinnedConfig: PinnedSessionConfig, + idleTtlMs: Long +) extends DatafusionPartition { + + def toSpec: SharedScanSpec = + SharedScanSpec( + scanId = scanId, + factoryFqcn = factoryFqcn, + optionsProtoBytes = optionsProtoBytes, + projectionColumnNames = projectionColumnNames, + filterProtoBytes = filterProtoBytes, + pinnedConfig = pinnedConfig + ) +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala index 2a08fdb..ba5409c 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionPartitionReaderFactory.scala @@ -38,8 +38,16 @@ class DatafusionPartitionReaderFactory(val readSchema: StructType) extends Parti "DatafusionPartitionReaderFactory: row-based read not supported; consumers must opt into columnar" ) - override def createColumnarReader(partition: InputPartition): PartitionReader[ColumnarBatch] = { - val p = partition.asInstanceOf[DatafusionInputPartition] - new DatafusionColumnarPartitionReader(p, readSchema) - } + override def createColumnarReader(partition: InputPartition): PartitionReader[ColumnarBatch] = + partition match { + case p: DatafusionInputPartition => + new DatafusionColumnarPartitionReader(p, readSchema) + case p: DatafusionKeyedInputPartition => + new DatafusionColumnarPartitionReader(p.base, readSchema) + case p: DatafusionSharedScanPartition => + new SharedScanPartitionReader(p, SharedScanCache.global) + case other => + throw new IllegalArgumentException( + s"unexpected InputPartition type: ${other.getClass.getName}") + } } diff --git 
a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index 755fa9f..d3931ce 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -28,16 +28,44 @@ import org.apache.spark.sql.connector.read.partitioning.{ } import org.apache.spark.sql.types.StructType +/** + * How the scan maps to Spark tasks — resolved once, driver-side, in + * [[DatafusionScanBuilder.build]]. + */ +sealed trait DatafusionScanMode extends Serializable + +/** + * Per-partition payload mode: one task per [[PartitionInfo]], each task builds its own provider + * from that entry's `partitionBytes`. `reported` is the bridge's optional partitioning + * declaration (may be null). + */ +final case class LegacyMode( + partitions: Array[PartitionInfo], + reported: ReportedPartitioning +) extends DatafusionScanMode + +/** + * Shared-scan mode: one cached provider + plan per (executor × scan), `numPartitions` tasks each + * driving one DataFusion output partition. See [[FfiProviderFactory#sharedScan]] for the + * determinism contract. + */ +final case class SharedScanMode( + scanId: String, + numPartitions: Int, + pinnedConfig: PinnedSessionConfig, + idleTtlMs: Long +) extends DatafusionScanMode + /** * Read plan for a DataFusion-backed scan. Holds pruning state, the pushed predicates (for - * `description()` / `explain(True)`), and the corresponding `LogicalExprNode` proto byte arrays - * the executor applies via `DataFrame.filterFromProto`. + * `description()` / `explain(True)`), the corresponding `LogicalExprNode` proto byte arrays the + * executor applies via `DataFrame.filterFromProto`, and the driver-resolved + * [[DatafusionScanMode]]. 
* - * Also carries the driver-resolved `PartitionInfo[]` (so [[DatafusionBatch]] doesn't re-call - * `listPartitions`) and the optional bridge-declared [[ReportedPartitioning]]; when present, the - * scan surfaces a `KeyGroupedPartitioning` via `SupportsReportPartitioning` so Spark's optimizer - * can skip shuffles ahead of compatible joins/aggregations. When absent, an - * `UnknownPartitioning(partitions.length)` is reported (still correct, just no shuffle elision). + * Legacy mode with a bridge-declared [[ReportedPartitioning]] surfaces `KeyGroupedPartitioning` + * via `SupportsReportPartitioning`; note Spark 3.3+ only consumes it when the input partitions + * also implement `HasPartitionKey` (see [[DatafusionBatch]]). Shared-scan mode always reports + * `UnknownPartitioning` — DataFusion-native partitions carry no key contract. */ class DatafusionScan( val factoryFqcn: String, @@ -46,21 +74,31 @@ class DatafusionScan( val prunedSchema: StructType, val pushedPredicates: Array[Predicate], val pushedPredicateBytes: Array[Array[Byte]], - val partitions: Array[PartitionInfo], - val reportedPartitioning: ReportedPartitioning + val mode: DatafusionScanMode ) extends Scan with SupportsReportPartitioning { override def readSchema(): StructType = prunedSchema - override def description(): String = + override def description(): String = { + val modeDesc = mode match { + case LegacyMode(partitions, reported) => + s"mode=per-partition, partitions=${partitions.length}," + + s" reportedPartitioning=${if (reported == null) "unknown" else "key-grouped"}" + case SharedScanMode(scanId, n, _, _) => + s"mode=shared-scan, scanId=$scanId, partitions=$n" + } s"DatafusionScan(factory=$factoryFqcn, projection=${prunedSchema.fieldNames.mkString(",")}," + - s" pushedPredicates=${pushedPredicates.length}, partitions=${partitions.length}," + - s" reportedPartitioning=${if (reportedPartitioning == null) "unknown" else "key-grouped"})" + s" pushedPredicates=${pushedPredicates.length}, 
$modeDesc)" + } override def toBatch: Batch = new DatafusionBatch(this) - override def outputPartitioning(): Partitioning = - if (reportedPartitioning == null) new UnknownPartitioning(partitions.length) - else new KeyGroupedPartitioning(reportedPartitioning.keys().toArray, partitions.length) + override def outputPartitioning(): Partitioning = mode match { + case LegacyMode(partitions, reported) => + if (reported == null) new UnknownPartitioning(partitions.length) + else new KeyGroupedPartitioning(reported.keys().toArray, partitions.length) + case SharedScanMode(_, numPartitions, _, _) => + new UnknownPartitioning(numPartitions) + } } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala index f1fe354..45eaa87 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -19,8 +19,11 @@ package io.datafusion.spark +import java.util.UUID + import org.apache.spark.sql.connector.expressions.filter.Predicate import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownRequiredColumns, SupportsPushDownV2Filters} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType /** @@ -31,11 +34,14 @@ import org.apache.spark.sql.types.StructType * translator (see [[SparkPredicateTranslator]]) only emits proto for predicates it can encode * losslessly — anything else returns `None` and lands in residuals. * - * `build()` is also where we resolve driver-side facts that the optimizer needs *before* it - * starts asking the [[DatafusionScan]] about its output partitioning: the partition list - * (`listPartitions`) and the bridge's optional [[ReportedPartitioning]]. 
Resolving both here once - * and threading them onto the Scan keeps `DatafusionBatch.planInputPartitions` shuffle-free and - * lets `outputPartitioning()` answer without an extra factory call per query. + * `build()` resolves the driver-side facts the optimizer needs *before* it starts asking the + * [[DatafusionScan]] about its output partitioning. Spark guarantees `pushPredicates` and + * `pruneColumns` run first, so both paths see the final projection + filters: + * - per-partition payload mode: `listPartitions(opts, filters)` (filter-aware overload — the + * bridge can prune whole partitions) + the optional [[ReportedPartitioning]]; + * - shared-scan mode: a probe build of the provider + plan (via the same code path executors + * use) to count DataFusion output partitions, plus a freshly minted scanId and the pinned + * session config that makes executor re-plans comparable. */ class DatafusionScanBuilder( factoryFqcn: String, @@ -79,13 +85,9 @@ class DatafusionScanBuilder( override def build(): Scan = { val factory = instantiateFactory(factoryFqcn) - val partitions: Array[PartitionInfo] = factory.listPartitions(optionsProtoBytes) - if (partitions == null || partitions.isEmpty) { - throw new IllegalStateException( - s"FfiProviderFactory '$factoryFqcn' returned no partitions to scan" - ) - } - val reported: ReportedPartitioning = factory.reportPartitioning(optionsProtoBytes) + val mode: DatafusionScanMode = + if (factory.sharedScan(optionsProtoBytes)) buildSharedScanMode() + else buildLegacyMode(factory) new DatafusionScan( factoryFqcn, optionsProtoBytes, @@ -93,9 +95,53 @@ class DatafusionScanBuilder( pruned, pushed, pushedBytes, - partitions, - reported + mode + ) + } + + private def buildLegacyMode(factory: FfiProviderFactory): LegacyMode = { + val partitions: Array[PartitionInfo] = + factory.listPartitions(optionsProtoBytes, pushedBytes) + if (partitions == null || partitions.isEmpty) { + throw new IllegalStateException( + s"FfiProviderFactory '$factoryFqcn' 
returned no partitions to scan" + ) + } + LegacyMode(partitions, factory.reportPartitioning(optionsProtoBytes)) + } + + /** + * Driver plan probe: build the provider + plan exactly as executors will (same widening, SQL, + * filters, pinned config — one code path in [[NativeSharedScanResources]]) and read the + * physical plan's output partition count. All Spark conf is resolved here, driver-side; + * executors only see the shipped copies. + */ + private def buildSharedScanMode(): SharedScanMode = { + val conf = SQLConf.get + val pinned = PinnedSessionConfig.fromConf(conf) + val idleTtlMs = PinnedSessionConfig.idleTtlMs(conf) + val scanId = UUID.randomUUID().toString + + val probeSpec = SharedScanSpec( + scanId = scanId, + factoryFqcn = factoryFqcn, + optionsProtoBytes = optionsProtoBytes, + projectionColumnNames = pruned.fieldNames, + filterProtoBytes = pushedBytes, + pinnedConfig = pinned ) + val probe = NativeSharedScanResources.build(probeSpec) + val numPartitions = + try { + probe.partitionCount + } finally { + probe.close() + } + if (numPartitions <= 0) { + throw new IllegalStateException( + s"shared-scan probe for factory '$factoryFqcn' produced a plan with no partitions") + } + SharedScanMode(scanId, numPartitions, pinned, idleTtlMs) } private def instantiateFactory(fqcn: String): FfiProviderFactory = { diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala new file mode 100644 index 0000000..a209bed --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +/** Shared SQL construction for the per-task (legacy) and shared-scan read paths. */ +private[spark] object DatafusionSqlBuilder { + + /** Registration name for the per-task provider in legacy mode. */ + val PartitionTableName = "df_spark_partition" + + /** Registration name for the per-executor provider in shared-scan mode. */ + val SharedTableName = "df_spark_shared" + + /** `SELECT <cols> FROM "<tableName>"`. */ + def buildSql(projectionColumnNames: Array[String], tableName: String): String = { + val cols = + if (projectionColumnNames.isEmpty) "*" + else + projectionColumnNames + .map(c => "\"" + c.replace("\"", "\"\"") + "\"") + .mkString(", ") + s"""SELECT $cols FROM "$tableName"""" + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala new file mode 100644 index 0000000..2351484 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.arrow.memory.{BufferAllocator, RootAllocator} +import org.apache.arrow.vector.ipc.ArrowReader +import org.apache.datafusion.{DataFrame, PartitionedExecution, SessionContext} +import org.apache.spark.internal.Logging + +/** + * JNI-backed shared-scan entry: one provider, one `SessionContext`, one planned + * [[PartitionedExecution]]. 
+ * + * The build sequence is the single code path for BOTH the driver-side partition-count probe and + * every executor's cache entry — identical widening, registration, SQL, filters, and pinned + * session config are what make the partition count comparable across machines (the bridge's + * determinism contract covers the rest). + */ +private[spark] final class NativeSharedScanResources( + allocator: RootAllocator, + ctx: SessionContext, + execution: PartitionedExecution +) extends SharedScanResources { + + override def partitionCount: Int = execution.partitionCount() + + override def newTaskAllocator(name: String): BufferAllocator = + allocator.newChildAllocator(name, 0, Long.MaxValue) + + override def openPartitionStream( + partition: Int, + taskAllocator: BufferAllocator): ArrowReader = + execution.executeStream(partition, taskAllocator) + + override def close(): Unit = { + var first: Throwable = null + def safe(f: => Unit): Unit = + try f + catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } + safe(execution.close()) + safe(ctx.close()) + safe(allocator.close()) + if (first != null) throw first + } +} + +private[spark] object NativeSharedScanResources extends Logging { + + def build(spec: SharedScanSpec): SharedScanResources = { + logInfo( + s"Building shared-scan entry for scanId=${spec.scanId} " + + s"(factory=${spec.factoryFqcn}, filters=${spec.filterProtoBytes.length})") + + val factory = Class + .forName(spec.factoryFqcn) + .getDeclaredConstructor() + .newInstance() + .asInstanceOf[FfiProviderFactory] + + val allocator = new RootAllocator(Long.MaxValue) + var ctx: SessionContext = null + try { + // Shared mode builds the dataset-wide provider: empty partitionBytes, like the + // driver-side schema probe. DataFusion-native partitioning replaces listPartitions. 
+ val rawPtr = factory.createProvider(spec.optionsProtoBytes, Array.emptyByteArray) + val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) + + ctx = spec.pinnedConfig.buildContext() + ctx.registerFfiTable(DatafusionSqlBuilder.SharedTableName, widenedPtr) + + var df: DataFrame = ctx.sql( + DatafusionSqlBuilder + .buildSql(spec.projectionColumnNames, DatafusionSqlBuilder.SharedTableName)) + var i = 0 + while (i < spec.filterProtoBytes.length) { + val filtered = df.filterFromProto(spec.filterProtoBytes(i)) + df.close() + df = filtered + i += 1 + } + + val execution = df.toPartitionedExecution() + new NativeSharedScanResources(allocator, ctx, execution) + } catch { + case t: Throwable => + if (ctx != null) { + try ctx.close() + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + } + try allocator.close() + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + throw t + } + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala new file mode 100644 index 0000000..af71b4c --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import org.apache.datafusion.SessionContext +import org.apache.spark.sql.internal.SQLConf + +/** + * DataFusion session knobs pinned by the driver and replicated verbatim on every executor in + * shared-scan mode. + * + * DataFusion's default `SessionConfig` derives `target_partitions` from the machine's core count, + * so a plan that yields N partitions on the driver could yield M ≠ N on a differently-sized + * executor — and partition-indexed execution would silently drop or duplicate data. The driver + * resolves these values once in `DatafusionScanBuilder.build()`, ships them inside every + * [[DatafusionSharedScanPartition]], and both the driver probe and the executors construct their + * `SessionContext` exclusively through [[buildContext]]. + * + * `options` additionally disables the optimizer's plan-reshaping repartition passes so the + * physical partitioning is exactly what the provider's `scan()` reports, on every machine. + */ +final case class PinnedSessionConfig( + targetPartitions: Int, + batchSize: Int, + options: Vector[(String, String)] +) extends Serializable { + + def buildContext(): SessionContext = { + val builder = SessionContext + .builder() + .targetPartitions(targetPartitions) + .batchSize(batchSize) + options.foreach { case (k, v) => builder.setOption(k, v) } + builder.build() + } +} + +object PinnedSessionConfig { + + val TargetPartitionsConf = "spark.datafusion.sharedScan.targetPartitions" + val BatchSizeConf = "spark.datafusion.sharedScan.batchSize" + val IdleTtlConf = "spark.datafusion.sharedScan.idleTtlMs" + + val DefaultTargetPartitions = 8 + val DefaultBatchSize = 8192 + val DefaultIdleTtlMs = 120000L + + /** + * Optimizer knobs that must not vary with the host. 
Round-robin repartition and file-scan + * repartition would let the optimizer change the plan's output partition count based on + * `target_partitions` heuristics; statistics collection could steer per-host plan differences. + */ + private val DeterminismOptions: Vector[(String, String)] = Vector( + "datafusion.optimizer.enable_round_robin_repartition" -> "false", + "datafusion.optimizer.repartition_file_scans" -> "false", + "datafusion.execution.collect_statistics" -> "false" + ) + + /** + * Resolve the pinned config from the driver's session conf. Called exactly once per scan, on + * the driver; executors never read Spark conf for these values — they use the shipped copy. + */ + def fromConf(conf: SQLConf): PinnedSessionConfig = { + PinnedSessionConfig( + targetPartitions = + conf.getConfString(TargetPartitionsConf, DefaultTargetPartitions.toString).toInt, + batchSize = conf.getConfString(BatchSizeConf, DefaultBatchSize.toString).toInt, + options = DeterminismOptions + ) + } + + def idleTtlMs(conf: SQLConf): Long = + conf.getConfString(IdleTtlConf, DefaultIdleTtlMs.toString).toLong +} diff --git a/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala b/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala new file mode 100644 index 0000000..a092eb7 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import java.util.concurrent.{ConcurrentHashMap, Executors, ScheduledExecutorService, TimeUnit} + +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.ipc.ArrowReader + +/** + * Everything the driver resolved that an executor needs to rebuild the shared scan: identity + * (scanId) plus the exact build inputs (factory, options, projection, filters, pinned config). + */ +final case class SharedScanSpec( + scanId: String, + factoryFqcn: String, + optionsProtoBytes: Array[Byte], + projectionColumnNames: Array[String], + filterProtoBytes: Array[Array[Byte]], + pinnedConfig: PinnedSessionConfig +) + +/** + * What one cached shared-scan entry exposes to readers. Implemented by + * [[NativeSharedScanResources]] (JNI-backed) and by fakes in tests. + */ +trait SharedScanResources extends AutoCloseable { + + /** Output partition count of the planned physical plan. */ + def partitionCount: Int + + /** Child allocator for one task's reader; closed by the task, before release. */ + def newTaskAllocator(name: String): BufferAllocator + + /** Open an independent stream over one plan partition. Concurrent-safe. */ + def openPartitionStream(partition: Int, taskAllocator: BufferAllocator): ArrowReader +} + +/** + * Executor-JVM cache of shared-scan entries, keyed by the driver-minted scanId. + * + * Semantics: + * - `acquire` builds the entry exactly once per attempt wave: the first caller builds under + * the entry's lock, concurrent callers block and share the result. 
Each successful acquire + * increments a refcount that the caller MUST pair with `release(scanId)`. + * - Build failures propagate to the builder AND all waiters of that attempt, and are not + * cached: the next acquire rebuilds. + * - Eviction closes entries with refcount 0 that have been idle longer than their TTL. The + * refcount covers every open reader, so native close never races an in-flight stream. + * Acquire after eviction rebuilds — correct, just slower. + * + * The cache itself is JNI-free: the entry builder is injected, so tests run without native libs. + */ +final class SharedScanCache( + buildEntry: SharedScanSpec => SharedScanResources, + nanoClock: () => Long = () => System.nanoTime() +) { + + /** + * Per-scanId slot. All state transitions are guarded by `this` (the holder's monitor); the + * build itself also runs under the monitor, which is what blocks concurrent acquirers of the + * same scan until the entry exists. + */ + private final class EntryHolder(spec: SharedScanSpec, idleTtlMs: Long) { + private var resources: SharedScanResources = _ + private var refCount: Int = 0 + private var lastReleaseNanos: Long = nanoClock() + private var closed: Boolean = false + + /** Returns the resources with refcount incremented, or None if this holder was evicted. */ + def acquire(): Option[SharedScanResources] = synchronized { + if (closed) return None + if (resources == null) { + resources = buildEntry(spec) // throws -> caller removes holder + } + refCount += 1 + Some(resources) + } + + def release(): Unit = synchronized { + refCount -= 1 + lastReleaseNanos = nanoClock() + } + + /** Close if idle past TTL; returns true when this holder is now closed. 
*/ + def closeIfIdle(nowNanos: Long): Boolean = synchronized { + if (closed) return true + val idle = refCount == 0 && + (nowNanos - lastReleaseNanos) >= TimeUnit.MILLISECONDS.toNanos(idleTtlMs) + if (idle) forceCloseLocked() + closed + } + + def forceClose(): Unit = synchronized { forceCloseLocked() } + + private def forceCloseLocked(): Unit = { + if (!closed) { + closed = true + if (resources != null) { + val r = resources + resources = null + r.close() + } + } + } + } + + private val entries = new ConcurrentHashMap[String, EntryHolder]() + + def acquire(spec: SharedScanSpec, idleTtlMs: Long): SharedScanResources = { + while (true) { + val holder = + entries.computeIfAbsent(spec.scanId, _ => new EntryHolder(spec, idleTtlMs)) + val acquired = + try { + holder.acquire() + } catch { + case t: Throwable => + // Build failed: drop the holder so the next acquire rebuilds, then propagate. + entries.remove(spec.scanId, holder) + throw t + } + acquired match { + case Some(resources) => return resources + case None => + // Holder was evicted between map lookup and acquire; retry with a fresh one. + entries.remove(spec.scanId, holder) + } + } + throw new IllegalStateException("unreachable") + } + + def release(scanId: String): Unit = { + val holder = entries.get(scanId) + if (holder == null) { + throw new IllegalStateException( + s"release($scanId) without a cached entry: unbalanced acquire/release") + } + holder.release() + } + + /** Close and remove every idle-past-TTL entry. Called by the evictor daemon and by tests. */ + private[spark] def evictIdleNow(): Unit = { + val now = nanoClock() + entries.forEach { (scanId, holder) => + if (holder.closeIfIdle(now)) { + entries.remove(scanId, holder) + } + } + } + + /** Close everything regardless of refcounts. JVM-shutdown path only. */ + def shutdown(): Unit = { + entries.forEach { (_, holder) => holder.forceClose() } + entries.clear() + } +} + +object SharedScanCache { + + /** Evictor period. 
Short relative to any sane TTL; cheap when the map is empty. */ + private val EvictorPeriodMs = 5000L + + /** JVM singleton used by executor tasks. Lazily started together with its evictor daemon. */ + lazy val global: SharedScanCache = { + val cache = new SharedScanCache(NativeSharedScanResources.build) + val evictor: ScheduledExecutorService = Executors.newSingleThreadScheduledExecutor { r => + val t = new Thread(r, "datafusion-shared-scan-evictor") + t.setDaemon(true) + t + } + evictor.scheduleWithFixedDelay( + () => cache.evictIdleNow(), + EvictorPeriodMs, + EvictorPeriodMs, + TimeUnit.MILLISECONDS) + Runtime.getRuntime.addShutdownHook(new Thread(() => cache.shutdown())) + cache + } +} diff --git a/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala new file mode 100644 index 0000000..01f62a2 --- /dev/null +++ b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.ipc.ArrowReader +import org.apache.spark.TaskContext +import org.apache.spark.sql.connector.read.PartitionReader +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Shared-scan task reader: acquires the executor's cached (provider, plan) entry and streams ONE + * DataFusion plan partition from it. The acquire/release refcount pair brackets the reader's + * whole lifetime, so the cache can never close the native plan under an open stream. + */ +class SharedScanPartitionReader( + partition: DatafusionSharedScanPartition, + cache: SharedScanCache +) extends PartitionReader[ColumnarBatch] + with ArrowColumnarBatchIteration { + + private val resources: SharedScanResources = cache.acquire(partition.toSpec, partition.idleTtlMs) + + // Determinism guard: the driver counted partitions by planning once; if this executor's + // re-plan disagrees, partition indices are meaningless and every task of the scan must fail + // rather than silently drop or duplicate data. + if (resources.partitionCount != partition.numPartitions) { + val executorCount = resources.partitionCount + cache.release(partition.scanId) + throw new IllegalStateException( + s"shared-scan determinism violation for scanId=${partition.scanId}: driver planned " + + s"${partition.numPartitions} partition(s) but this executor planned $executorCount. 
" + + "The provider's partitioning must be a pure function of optionsProtoBytes; pin your " + + "source snapshot (see FfiProviderFactory.sharedScan).") + } + + private val taskAllocator: BufferAllocator = { + val attempt = Option(TaskContext.get()).map(_.taskAttemptId()).getOrElse(-1L) + resources.newTaskAllocator( + s"shared-${partition.scanId}-p${partition.partitionIndex}-attempt$attempt") + } + + override protected val arrowReader: ArrowReader = + try { + resources.openPartitionStream(partition.partitionIndex, taskAllocator) + } catch { + case t: Throwable => + try taskAllocator.close() + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + cache.release(partition.scanId) + throw t + } + + override def close(): Unit = { + var first: Throwable = null + def safe(f: => Unit): Unit = + try f + catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } + safe(arrowReader.close()) + safe(taskAllocator.close()) + // Release LAST: the refcount must cover the open stream and the task allocator. + safe(cache.release(partition.scanId)) + if (first != null) throw first + } +} diff --git a/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala b/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala new file mode 100644 index 0000000..4dcb3a9 --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import java.util.{Map => JMap} + +import org.scalatest.funsuite.AnyFunSuite + +class FfiProviderFactoryDefaultsTest extends AnyFunSuite { + + /** Minimal factory implementing only the abstract methods — exercises the defaults. */ + private class MinimalFactory extends FfiProviderFactory { + var lastListPartitionsOpts: Array[Byte] = _ + + override def encodeOptions(sparkOptions: JMap[String, String]): Array[Byte] = + Array.emptyByteArray + + override def listPartitions(optionsProtoBytes: Array[Byte]): Array[PartitionInfo] = { + lastListPartitionsOpts = optionsProtoBytes + Array(new PartitionInfo("p0", Array.emptyByteArray, Array.empty[String])) + } + + override def createProvider( + optionsProtoBytes: Array[Byte], + partitionBytes: Array[Byte]): Long = 0L + } + + test("sharedScan defaults to false") { + assert(!new MinimalFactory().sharedScan(Array[Byte](1, 2, 3))) + } + + test("filter-aware listPartitions delegates to the filter-unaware overload") { + val factory = new MinimalFactory + val opts = Array[Byte](7, 8) + val filters = Array(Array[Byte](1), Array[Byte](2)) + val partitions = factory.listPartitions(opts, filters) + assert(partitions.length == 1) + assert(partitions(0).id == "p0") + assert(factory.lastListPartitionsOpts eq opts) + } + + test("reportPartitioning defaults to null") { + assert(new MinimalFactory().reportPartitioning(Array.emptyByteArray) == null) + } + + test("PartitionInfo 3-arg constructor leaves partitionKeyValues null") { + val p = new PartitionInfo("p0", Array.emptyByteArray, 
Array.empty[String]) + assert(p.partitionKeyValues() == null) + } + + test("PartitionInfo 4-arg constructor carries key values") { + val p = new PartitionInfo( + "p0", + Array.emptyByteArray, + Array.empty[String], + Array[AnyRef]("segment-a", Long.box(42L))) + assert(p.partitionKeyValues().length == 2) + assert(p.partitionKeyValues()(0) == "segment-a") + } +} diff --git a/spark/src/test/scala/io/datafusion/spark/PartitionKeyConversionTest.scala b/spark/src/test/scala/io/datafusion/spark/PartitionKeyConversionTest.scala new file mode 100644 index 0000000..e2f876d --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/PartitionKeyConversionTest.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import org.apache.spark.unsafe.types.UTF8String +import org.scalatest.funsuite.AnyFunSuite + +class PartitionKeyConversionTest extends AnyFunSuite { + + private def info(id: String, keys: Array[AnyRef]): PartitionInfo = + new PartitionInfo(id, Array.emptyByteArray, Array.empty[String], keys) + + private def infoNoKeys(id: String): PartitionInfo = + new PartitionInfo(id, Array.emptyByteArray, Array.empty[String]) + + test("String and Long key values convert to catalyst representations") { + val reported = ReportedPartitioning.identity("segment_id", "bucket") + val row = + DatafusionBatch.toKeyRow("p0", Array[AnyRef]("segment-a", Long.box(42L)), reported) + assert(row.numFields == 2) + assert(row.get(0, org.apache.spark.sql.types.StringType) == UTF8String.fromString("segment-a")) + assert(row.getLong(1) == 42L) + } + + test("arity mismatch between key values and declared keys throws") { + val reported = ReportedPartitioning.identity("segment_id", "bucket") + val e = intercept[IllegalStateException] { + DatafusionBatch.toKeyRow("p0", Array[AnyRef]("only-one"), reported) + } + assert(e.getMessage.contains("declares 2 key(s)")) + } + + test("keyed state requires reported partitioning") { + val partitions = Array(info("p0", Array[AnyRef]("a"))) + assert(!DatafusionBatch.validateKeyedState("F", partitions, null)) + } + + test("no partitions with keys means unkeyed, even with reported partitioning") { + val reported = ReportedPartitioning.identity("segment_id") + val partitions = Array(infoNoKeys("p0"), infoNoKeys("p1")) + assert(!DatafusionBatch.validateKeyedState("F", partitions, reported)) + } + + test("all partitions with keys means keyed") { + val reported = ReportedPartitioning.identity("segment_id") + val partitions = + Array(info("p0", Array[AnyRef]("a")), info("p1", Array[AnyRef]("b"))) + assert(DatafusionBatch.validateKeyedState("F", partitions, reported)) + } + + test("mixed keyed and unkeyed partitions throw driver-side") 
{ + val reported = ReportedPartitioning.identity("segment_id") + val partitions = Array(info("p0", Array[AnyRef]("a")), infoNoKeys("p1")) + val e = intercept[IllegalStateException] { + DatafusionBatch.validateKeyedState("F", partitions, reported) + } + assert(e.getMessage.contains("only 1 of 2")) + } +} diff --git a/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala b/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala new file mode 100644 index 0000000..08acc97 --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.datafusion.spark + +import java.util.concurrent.{CountDownLatch, Executors, TimeUnit} +import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} + +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.ipc.ArrowReader +import org.scalatest.funsuite.AnyFunSuite + +class SharedScanCacheTest extends AnyFunSuite { + + private def spec(scanId: String): SharedScanSpec = + SharedScanSpec( + scanId = scanId, + factoryFqcn = "test.Factory", + optionsProtoBytes = Array.emptyByteArray, + projectionColumnNames = Array.empty, + filterProtoBytes = Array.empty, + pinnedConfig = PinnedSessionConfig(8, 8192, Vector.empty) + ) + + /** JNI-free fake entry; records close. */ + private final class FakeResources extends SharedScanResources { + @volatile var closed = false + override def partitionCount: Int = 3 + override def newTaskAllocator(name: String): BufferAllocator = + throw new UnsupportedOperationException("not used in cache tests") + override def openPartitionStream(p: Int, a: BufferAllocator): ArrowReader = + throw new UnsupportedOperationException("not used in cache tests") + override def close(): Unit = closed = true + } + + private final class Fixture { + val clock = new AtomicLong(0L) + val buildCount = new AtomicInteger(0) + var failBuilds = false + var lastBuilt: FakeResources = _ + + val cache = new SharedScanCache( + buildEntry = _ => { + buildCount.incrementAndGet() + if (failBuilds) throw new RuntimeException("synthetic build failure") + lastBuilt = new FakeResources + lastBuilt + }, + nanoClock = () => clock.get() + ) + + def advanceMillis(ms: Long): Unit = clock.addAndGet(TimeUnit.MILLISECONDS.toNanos(ms)) + } + + test("acquire builds once, second acquire reuses, refcount pairs with release") { + val f = new Fixture + val r1 = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + val r2 = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + assert(f.buildCount.get() == 1) + assert(r1 eq r2) + f.cache.release("s1") + 
f.cache.release("s1") + } + + test("concurrent acquires build exactly once") { + val f = new Fixture + val n = 8 + val pool = Executors.newFixedThreadPool(n) + val ready = new CountDownLatch(n) + val go = new CountDownLatch(1) + try { + val futures = (0 until n).map { _ => + pool.submit { () => + ready.countDown() + go.await() + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + } + } + ready.await() + go.countDown() + val results = futures.map(_.get(10, TimeUnit.SECONDS)) + assert(f.buildCount.get() == 1) + assert(results.forall(_ eq results.head)) + (0 until n).foreach(_ => f.cache.release("s1")) + } finally { + pool.shutdownNow() + } + } + + test("build failure propagates and is not cached") { + val f = new Fixture + f.failBuilds = true + val e = intercept[RuntimeException](f.cache.acquire(spec("s1"), idleTtlMs = 1000)) + assert(e.getMessage == "synthetic build failure") + f.failBuilds = false + val r = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + assert(f.buildCount.get() == 2) + assert(r eq f.lastBuilt) + f.cache.release("s1") + } + + test("idle entry past TTL is evicted and closed") { + val f = new Fixture + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + f.cache.release("s1") + val built = f.lastBuilt + f.advanceMillis(999) + f.cache.evictIdleNow() + assert(!built.closed) + f.advanceMillis(2) + f.cache.evictIdleNow() + assert(built.closed) + } + + test("entry in use is never evicted, regardless of idle time") { + val f = new Fixture + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + val built = f.lastBuilt + f.advanceMillis(100000) + f.cache.evictIdleNow() + assert(!built.closed) + f.cache.release("s1") + f.advanceMillis(100000) + f.cache.evictIdleNow() + assert(built.closed) + } + + test("release then reacquire within TTL resets idleness") { + val f = new Fixture + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + f.cache.release("s1") + f.advanceMillis(900) + // Next task wave lands before TTL: same entry, no rebuild. 
+ val r = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + assert(f.buildCount.get() == 1) + assert(r eq f.lastBuilt) + f.cache.release("s1") + f.advanceMillis(900) + f.cache.evictIdleNow() + assert(!f.lastBuilt.closed, "idle clock must restart at the last release") + } + + test("acquire after eviction rebuilds") { + val f = new Fixture + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + f.cache.release("s1") + val first = f.lastBuilt + f.advanceMillis(2000) + f.cache.evictIdleNow() + assert(first.closed) + val r = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + assert(f.buildCount.get() == 2) + assert(r ne first) + f.cache.release("s1") + } + + test("distinct scanIds get distinct entries") { + val f = new Fixture + val r1 = f.cache.acquire(spec("s1"), idleTtlMs = 1000) + val r2 = f.cache.acquire(spec("s2"), idleTtlMs = 1000) + assert(f.buildCount.get() == 2) + assert(r1 ne r2) + f.cache.release("s1") + f.cache.release("s2") + } + + test("unbalanced release throws") { + val f = new Fixture + intercept[IllegalStateException](f.cache.release("never-acquired")) + } + + test("shutdown closes everything, even entries in use") { + val f = new Fixture + f.cache.acquire(spec("s1"), idleTtlMs = 1000) + val built = f.lastBuilt + f.cache.shutdown() + assert(built.closed) + } +} From 1cffd9307e9736949701874c8999a12c696bfb1f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 13:32:22 +0200 Subject: [PATCH 11/22] refactor(spark): fold scan planning/execution into connector cdylib The connector now consumes a bridge's FFI_TableProvider entirely in process: FfiHelperNative.createScan widens the provider, builds the pinned-config SessionContext, applies projection and pushed proto filters, and plans once; partition streams cross back as FFI_ArrowArrayStream. This drops the re-FFI widening hop and the SQL string round-trip, and shrinks the datafusion-ffi version-lockstep surface from three cdylibs to two (bridge <-> connector). 
- extract datafusion-jni-common (error->exception mapping, runtime singleton, StreamingReader) shared by datafusion-jni and the connector cdylib - revert the core additions the old path needed: registerFfiTable, DataFrame.filterFromProto/toPartitionedExecution, and PartitionedExecution, restoring core's public surface to main - core jar remains a Spark dependency only for the generated datafusion.protobuf classes and typed exceptions; its cdylib no longer loads in Spark JVMs - delete DatafusionSqlBuilder and the plain-Java FfiTableProviderExample Verified: cargo and mvn suites green; pyspark demo passes legacy and shared-scan modes with pushdown, projection, and partition parity. Co-Authored-By: Claude Fable 5 --- Cargo.lock | 15 +- Cargo.toml | 1 + .../java/org/apache/datafusion/DataFrame.java | 42 --- .../datafusion/PartitionedExecution.java | 114 ------ .../org/apache/datafusion/SessionContext.java | 33 -- .../datafusion/PartitionedExecutionTest.java | 196 ---------- docs/source/contributor-guide/development.md | 4 +- examples/README.md | 4 +- examples/SPARK_INTEGRATION.md | 44 +-- examples/native/src/lib.rs | 19 +- .../examples/ExampleFfiProviderFactory.java | 4 +- .../examples/FfiTableProviderExample.java | 90 ----- .../FfiTableProviderExampleNative.java | 13 +- native-common/Cargo.toml | 35 ++ {native => native-common}/src/errors.rs | 7 +- native-common/src/lib.rs | 103 ++++++ native/Cargo.toml | 9 +- native/src/arrow.rs | 2 +- native/src/avro.rs | 2 +- native/src/cache_manager.rs | 2 +- native/src/csv.rs | 2 +- native/src/ffi_table_provider.rs | 71 ---- native/src/json.rs | 2 +- native/src/lib.rs | 109 +----- native/src/object_store.rs | 2 +- native/src/partitioned_execution.rs | 169 --------- native/src/proto.rs | 2 +- native/src/runtime_metrics.rs | 4 +- native/src/schema.rs | 2 +- spark/native/Cargo.toml | 6 + spark/native/src/lib.rs | 115 +----- spark/native/src/scan.rs | 350 ++++++++++++++++++ .../io/datafusion/spark/FfiHelperNative.java | 68 +++- 
.../datafusion/spark/FfiProviderFactory.java | 2 +- .../DatafusionColumnarPartitionReader.scala | 62 ++-- .../spark/DatafusionInputPartition.scala | 2 +- .../io/datafusion/spark/DatafusionScan.scala | 2 +- .../datafusion/spark/DatafusionSource.scala | 32 +- ...fusionSqlBuilder.scala => FfiStream.scala} | 35 +- .../spark/NativeSharedScanResources.scala | 56 ++- .../spark/PinnedSessionConfig.scala | 17 +- 41 files changed, 752 insertions(+), 1097 deletions(-) delete mode 100644 core/src/main/java/org/apache/datafusion/PartitionedExecution.java delete mode 100644 core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java delete mode 100644 examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java create mode 100644 native-common/Cargo.toml rename {native => native-common}/src/errors.rs (97%) create mode 100644 native-common/src/lib.rs delete mode 100644 native/src/ffi_table_provider.rs delete mode 100644 native/src/partitioned_execution.rs create mode 100644 spark/native/src/scan.rs rename spark/src/main/scala/io/datafusion/spark/{DatafusionSqlBuilder.scala => FfiStream.scala} (52%) diff --git a/Cargo.lock b/Cargo.lock index a6a4204..4a7b53d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1449,7 +1449,7 @@ dependencies = [ "arrow", "async-trait", "datafusion", - "datafusion-ffi", + "datafusion-jni-common", "datafusion-proto", "datafusion-substrait", "futures", @@ -1463,6 +1463,16 @@ dependencies = [ "url", ] +[[package]] +name = "datafusion-jni-common" +version = "0.1.0" +dependencies = [ + "datafusion", + "futures", + "jni", + "tokio", +] + [[package]] name = "datafusion-macros" version = "53.1.0" @@ -1679,8 +1689,11 @@ dependencies = [ "async-trait", "datafusion", "datafusion-ffi", + "datafusion-jni-common", + "datafusion-proto", "futures", "jni", + "prost", "tokio", ] diff --git a/Cargo.toml b/Cargo.toml index d582098..c9f0f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ resolver = "2" members = [ "native", + 
"native-common", "examples/native", "spark/native", ] diff --git a/core/src/main/java/org/apache/datafusion/DataFrame.java b/core/src/main/java/org/apache/datafusion/DataFrame.java index 39834cc..d4e0226 100644 --- a/core/src/main/java/org/apache/datafusion/DataFrame.java +++ b/core/src/main/java/org/apache/datafusion/DataFrame.java @@ -113,23 +113,6 @@ public ArrowReader executeStream(BufferAllocator allocator) { } } - /** - * Plan this DataFrame once and return a {@link PartitionedExecution} that can stream each - * physical-plan output partition independently (and concurrently from multiple threads). - * - *

    Consumes this DataFrame with the same lifecycle rules as {@link - * #executeStream(BufferAllocator)}: the logical plan is released into the planned execution, and - * the caller owns (and must close) the returned handle. - */ - public PartitionedExecution toPartitionedExecution() { - if (nativeHandle == 0) { - throw new IllegalStateException("DataFrame is closed or already collected"); - } - long handle = nativeHandle; - nativeHandle = 0; - return new PartitionedExecution(createPartitionedExecution(handle)); - } - /** * Return the Arrow {@link Schema} of this DataFrame's output. Non-consuming: the receiver remains * usable and must still be closed independently. Schema inspection does not execute the plan. @@ -247,27 +230,6 @@ public DataFrame filter(String predicate) { return new DataFrame(filterRows(nativeHandle, predicate)); } - /** - * Apply a DataFusion-proto {@code LogicalExprNode} as a filter to this DataFrame. The bytes must - * be a serialized {@code datafusion.LogicalExprNode} (see {@code - * org.apache.datafusion.protobuf.LogicalExprNode}). Used by the Spark connector to push V2 {@code - * Predicate}s as proto-encoded expressions (sibling of {@link #filter(String)} for the structured - * wire path). - * - * @throws IllegalStateException if this context is closed. - * @throws RuntimeException if the bytes are not a valid {@code LogicalExprNode}, the expression - * references unknown columns/UDFs, or filter construction fails. - */ - public DataFrame filterFromProto(byte[] exprProtoBytes) { - if (nativeHandle == 0) { - throw new IllegalStateException("DataFrame is closed or already collected"); - } - if (exprProtoBytes == null) { - throw new IllegalArgumentException("filterFromProto exprProtoBytes must be non-null"); - } - return new DataFrame(filterFromProto(nativeHandle, exprProtoBytes)); - } - /** * Take the first {@code fetch} rows. Equivalent to {@link #limit(int, int)} with {@code skip = * 0}. 
The receiver remains usable and must still be closed independently. @@ -823,8 +785,6 @@ public void close() { private static native void executeStreamDataFrame(long handle, long ffiStreamAddr); - private static native long createPartitionedExecution(long handle); - private static native void closeDataFrame(long handle); private static native long countRows(long handle); @@ -845,8 +805,6 @@ public void close() { private static native long filterRows(long handle, String predicate); - private static native long filterFromProto(long handle, byte[] exprProtoBytes); - private static native long limitRows(long handle, int skip, int fetch); private static native long distinctRows(long handle); diff --git a/core/src/main/java/org/apache/datafusion/PartitionedExecution.java b/core/src/main/java/org/apache/datafusion/PartitionedExecution.java deleted file mode 100644 index 8b9ce08..0000000 --- a/core/src/main/java/org/apache/datafusion/PartitionedExecution.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datafusion; - -import org.apache.arrow.c.ArrowArrayStream; -import org.apache.arrow.c.Data; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.ipc.ArrowReader; - -/** - * A DataFrame planned exactly once, exposing its physical plan's output partitions for individual - * streaming. Obtained via {@link DataFrame#toPartitionedExecution()}. - * - *

<p>Unlike {@link DataFrame#executeStream(BufferAllocator)}, which coalesces every output - * partition into one stream, this handle lets distinct threads drive distinct partitions — e.g. one - * Spark task per DataFusion partition. - * - *

    Thread safety. {@link #partitionCount()} and {@link #executeStream(int, - * BufferAllocator)} are safe to call concurrently from multiple threads on the same instance. - * Re-executing the same partition index more than once opens an independent native stream each - * time, but only succeeds when every operator in that partition's pipeline supports repeated {@code - * execute()} — stateless scans (MemTable, table providers) do; {@code RepartitionExec} pipelines - * (hash aggregates, joins) do not and fail the second stream. {@link #close()} is idempotent, but - * the caller must guarantee that no {@code executeStream} call is in flight and that all returned - * readers have been closed before calling it — the native plan is freed immediately. Consumers that - * share one instance across threads must enforce that ordering themselves (e.g. with a reference - * count). - */ -public final class PartitionedExecution implements AutoCloseable { - static { - NativeLibraryLoader.loadLibrary(); - } - - private volatile long nativeHandle; - - PartitionedExecution(long nativeHandle) { - if (nativeHandle == 0) { - throw new IllegalStateException("Failed to create native PartitionedExecution"); - } - this.nativeHandle = nativeHandle; - } - - /** Number of output partitions of the planned physical plan. */ - public int partitionCount() { - long handle = nativeHandle; - if (handle == 0) { - throw new IllegalStateException("PartitionedExecution is closed"); - } - return partitionCountNative(handle); - } - - /** - * Open an independent stream over one plan partition. Each call to {@link - * ArrowReader#loadNextBatch} drives one async {@code stream.next()} on the native side, so memory - * pressure stays bounded by the executor pipeline plus one in-flight batch. - * - *

    Non-consuming: this instance remains usable, and concurrent calls — including for the same - * partition index — are safe. The caller closes the returned reader; the supplied allocator must - * outlive it. - * - * @param partition partition index in {@code [0, partitionCount())} - * @throws RuntimeException if the index is out of range for the planned partitioning - */ - public ArrowReader executeStream(int partition, BufferAllocator allocator) { - long handle = nativeHandle; - if (handle == 0) { - throw new IllegalStateException("PartitionedExecution is closed"); - } - ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator); - try { - executeStreamPartition(handle, partition, stream.memoryAddress()); - return Data.importArrayStream(allocator, stream); - } catch (Throwable e) { - stream.close(); - throw e; - } - } - - /** - * Release the native plan. Idempotent. See the class Javadoc for the ordering contract with - * in-flight {@link #executeStream(int, BufferAllocator)} calls. - */ - @Override - public void close() { - long handle = nativeHandle; - if (handle != 0) { - nativeHandle = 0; - closePartitionedExecution(handle); - } - } - - private static native int partitionCountNative(long handle); - - private static native void executeStreamPartition(long handle, int partition, long ffiStreamAddr); - - private static native void closePartitionedExecution(long handle); -} diff --git a/core/src/main/java/org/apache/datafusion/SessionContext.java b/core/src/main/java/org/apache/datafusion/SessionContext.java index ea56d80..27d2b16 100644 --- a/core/src/main/java/org/apache/datafusion/SessionContext.java +++ b/core/src/main/java/org/apache/datafusion/SessionContext.java @@ -571,36 +571,6 @@ public void registerUdf(ScalarUdf udf) { * context is closed. * @throws RuntimeException if native registration fails. */ - /** - * Register a TableProvider produced as an {@code FFI_TableProvider} pointer by Rust code on the - * far side of the FFI boundary. - * - *

<p>The pointer is the raw boxed address ({@code Box::into_raw(Box::new(FFI_TableProvider))}) - * returned by another cdylib's JNI entry point — typically a domain bridge (e.g. Rerun's - * createFfiProvider) followed by the connector-core widening helper. Ownership transfers in; the - * pointer must not be reused after this call. - * - *

    Predicate pushdown and projection cross the FFI boundary as part of the standard - * datafusion-ffi protocol; no JVM-side TableProvider implementation runs. - * - * @throws IllegalStateException if this context is closed. - * @throws IllegalArgumentException if {@code name} is empty or {@code ffiTableProviderPtr} is 0. - * @throws RuntimeException if native registration fails. - */ - public void registerFfiTable(String name, long ffiTableProviderPtr) { - if (nativeHandle == 0) { - throw new IllegalStateException("SessionContext is closed"); - } - if (name == null || name.isEmpty()) { - throw new IllegalArgumentException("registerFfiTable name must be non-empty"); - } - if (ffiTableProviderPtr == 0) { - throw new IllegalArgumentException( - "registerFfiTable ffiTableProviderPtr must be a non-null FFI_TableProvider pointer"); - } - registerFfiTableNative(nativeHandle, name, ffiTableProviderPtr); - } - public void registerTable(String name, TableProvider provider) { if (nativeHandle == 0) { throw new IllegalStateException("SessionContext is closed"); @@ -695,7 +665,4 @@ private static native void registerScalarUdf( private static native void registerTableNative( long handle, String name, byte[] schemaIpcBytes, TableProvider provider); - - private static native void registerFfiTableNative( - long handle, String name, long ffiTableProviderPtr); } diff --git a/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java b/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java deleted file mode 100644 index 74d320a..0000000 --- a/core/src/test/java/org/apache/datafusion/PartitionedExecutionTest.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datafusion; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.BigIntVector; -import org.apache.arrow.vector.ipc.ArrowReader; -import org.junit.jupiter.api.Test; - -class PartitionedExecutionTest { - - /** - * A plan whose physical form reliably keeps {@code targetPartitions} output partitions: the - * hash-repartitioned GROUP BY can't be collapsed by the physical optimizer, unlike a bare - * top-level round-robin repartition, which {@code EnforceDistribution} removes as non-beneficial. - */ - private static final String GROUPED_SQL = - "SELECT x FROM (VALUES (1), (2), (3), (4), (5), (6), (7), (8)) AS t(x) GROUP BY x"; - - private static final List EXPECTED_ROWS = List.of(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L); - - /** Drain one partition's reader into a list of long values from column 0. 
*/ - private static List drain(ArrowReader reader) throws Exception { - List out = new ArrayList<>(); - while (reader.loadNextBatch()) { - BigIntVector v = (BigIntVector) reader.getVectorSchemaRoot().getVector(0); - for (int i = 0; i < v.getValueCount(); i++) { - out.add(v.get(i)); - } - } - return out; - } - - private static List drainPartition( - PartitionedExecution exec, int partition, BufferAllocator allocator) throws Exception { - try (ArrowReader reader = exec.executeStream(partition, allocator)) { - return drain(reader); - } - } - - @Test - void partitionCountMatchesTargetPartitions() throws Exception { - try (SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); - DataFrame df = ctx.sql(GROUPED_SQL); - PartitionedExecution exec = df.toPartitionedExecution()) { - assertEquals(4, exec.partitionCount()); - } - } - - @Test - void unionOfPartitionsEqualsFullResult() throws Exception { - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); - DataFrame df = ctx.sql(GROUPED_SQL); - PartitionedExecution exec = df.toPartitionedExecution()) { - assertEquals(4, exec.partitionCount()); - List all = new ArrayList<>(); - for (int p = 0; p < exec.partitionCount(); p++) { - all.addAll(drainPartition(exec, p, allocator)); - } - all.sort(Long::compare); - assertEquals(EXPECTED_ROWS, all); - } - } - - @Test - void concurrentPartitionStreamsAreIndependent() throws Exception { - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = SessionContext.builder().targetPartitions(4).build(); - DataFrame df = ctx.sql(GROUPED_SQL); - PartitionedExecution exec = df.toPartitionedExecution()) { - int n = exec.partitionCount(); - assertEquals(4, n); - ExecutorService pool = Executors.newFixedThreadPool(n); - try { - List>> jobs = new ArrayList<>(); - for (int p = 0; p < n; p++) { - final int partition = p; - jobs.add(() -> drainPartition(exec, partition, allocator)); - } 
- List all = new ArrayList<>(); - for (Future> f : pool.invokeAll(jobs)) { - all.addAll(f.get()); - } - all.sort(Long::compare); - assertEquals(EXPECTED_ROWS, all); - } finally { - pool.shutdownNow(); - } - } - } - - @Test - void samePartitionCanBeStreamedTwiceOnStatelessScans() throws Exception { - // Spark task retry / speculative execution re-executes a partition index. - // Re-execution is only supported by plans whose partitions are stateless - // scans (MemoryExec, table providers): a UNION ALL of two VALUES keeps one - // re-executable MemoryExec partition per branch. Pipelines containing - // RepartitionExec (e.g. a hash GROUP BY) panic on second execute -- its - // per-partition channels are single-use -- which is why this test does not - // reuse GROUPED_SQL. - String unionSql = - "SELECT * FROM (VALUES (1), (2)) AS t(x) UNION ALL SELECT * FROM (VALUES (3), (4)) AS t(x)"; - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = new SessionContext(); - DataFrame df = ctx.sql(unionSql); - PartitionedExecution exec = df.toPartitionedExecution()) { - assertEquals(2, exec.partitionCount()); - List firstTotal = new ArrayList<>(); - List secondTotal = new ArrayList<>(); - for (int p = 0; p < exec.partitionCount(); p++) { - firstTotal.addAll(drainPartition(exec, p, allocator)); - secondTotal.addAll(drainPartition(exec, p, allocator)); - } - firstTotal.sort(Long::compare); - secondTotal.sort(Long::compare); - assertEquals(List.of(1L, 2L, 3L, 4L), firstTotal); - assertEquals(firstTotal, secondTotal); - } - } - - @Test - void toPartitionedExecutionConsumesTheDataFrame() throws Exception { - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = new SessionContext()) { - DataFrame df = ctx.sql("SELECT 1"); - try (PartitionedExecution exec = df.toPartitionedExecution()) { - assertTrue(exec.partitionCount() >= 1); - } - assertThrows(IllegalStateException.class, () -> df.executeStream(allocator)); - 
assertThrows(IllegalStateException.class, df::toPartitionedExecution); - // close() on a consumed DataFrame stays a no-op (no double-free). - df.close(); - } - } - - @Test - void closeIsIdempotentAndBlocksFurtherUse() throws Exception { - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = new SessionContext(); - DataFrame df = ctx.sql("SELECT 1")) { - PartitionedExecution exec = df.toPartitionedExecution(); - exec.close(); - exec.close(); - assertThrows(IllegalStateException.class, exec::partitionCount); - assertThrows(IllegalStateException.class, () -> exec.executeStream(0, allocator)); - } - } - - @Test - void outOfRangePartitionThrowsClearError() throws Exception { - try (BufferAllocator allocator = new RootAllocator(); - SessionContext ctx = SessionContext.builder().targetPartitions(2).build(); - DataFrame df = ctx.sql(GROUPED_SQL); - PartitionedExecution exec = df.toPartitionedExecution()) { - assertEquals(2, exec.partitionCount()); - RuntimeException e = - assertThrows(RuntimeException.class, () -> exec.executeStream(7, allocator)); - assertTrue( - e.getMessage().contains("out of range"), - "expected out-of-range message, got: " + e.getMessage()); - assertThrows(RuntimeException.class, () -> exec.executeStream(-1, allocator)); - // The handle survives a failed executeStream call. - assertEquals(2, exec.partitionCount()); - } - } -} diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index 9eba9a5..cb80276 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -88,7 +88,9 @@ The repository is a multi-module Maven build: - `examples/` — `datafusion-java-examples` module containing runnable examples that depend on the library; built alongside the library so they cannot fall out of sync with the API. Includes `examples/native/`, a - small FFI table-provider cdylib used by `FfiTableProviderExample`. 
+ small FFI table-provider cdylib used by the Spark connector demo + (`ExampleFfiProviderFactory` + the pyspark script under + `examples/python/`). - `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface). - `proto/` — Protobuf definitions shared between Java and Rust. - `Makefile` — top-level build orchestration (`make test`, `make format`, diff --git a/examples/README.md b/examples/README.md index d2c2e5b..127c209 100644 --- a/examples/README.md +++ b/examples/README.md @@ -32,11 +32,11 @@ add `-Dmaven.repo.local=/path/to/repo` to BOTH invocations.) | `JdbcExample` | Pull from an H2 JDBC source into Arrow, register it, query. | | `AddOneExample` | Implement a Scalar UDF in Java and register it on the session. | | `NestedTypeUdfExample` | Scalar UDF over `List` — input + output nested arrow types. | -| `FfiTableProviderExample` | Build an `FFI_TableProvider` in Rust (a `MemTable`), hand the raw pointer to the JVM, register it via `SessionContext.registerFfiTable`, run SQL. **See also: [SPARK_INTEGRATION.md](SPARK_INTEGRATION.md).** | +| `ExampleFfiProviderFactory` | Build an `FFI_TableProvider` in Rust (a `MemTable`) and expose it to Spark through the connector's `FfiProviderFactory` interface. **See: [SPARK_INTEGRATION.md](SPARK_INTEGRATION.md) and the pyspark demo under [`python/`](python/).** | ## Building the FFI example's cdylib -The `FfiTableProviderExample` relies on a small Rust cdylib under +The FFI provider examples rely on a small Rust cdylib under [`native/`](native/). 
It is a member of the repo-root Cargo workspace, so build it by name from anywhere in the tree: diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md index e290b79..a658e4d 100644 --- a/examples/SPARK_INTEGRATION.md +++ b/examples/SPARK_INTEGRATION.md @@ -1,12 +1,12 @@ # Using an FFI TableProvider as a Spark Data Source -The [`FfiTableProviderExample`](src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java) -shows the JVM side of the FFI handover: Rust builds an `FFI_TableProvider`, -hands the raw pointer to the JVM, and the JVM calls -`SessionContext.registerFfiTable(name, ptr)` to make it queryable through -DataFusion-Java. +The FFI handover is simple: Rust builds an `FFI_TableProvider`, hands the raw +pointer to the JVM, and the JVM passes it to the connector cdylib +(`FfiHelperNative.createScan`), which does everything DataFusion-side in +process — widening, session construction, projection, pushed filters, +planning, and partition streams. -That same flow plugs into Apache Spark as a DataSource V2 by way of the +That flow plugs into Apache Spark as a DataSource V2 by way of the [`connector-core`](https://github.com/rerun-io/rerun-spark-connector) module (generic Spark plumbing donated upstream-ready). Below is the recipe for wiring a domain bridge — e.g. an in-house format or a custom catalog — into @@ -25,27 +25,23 @@ Spark via this pattern. 
| v +--------------------------+ +------------------------------+ -| connector-core cdylib | jlong (wide) | connector-core JVM | +| connector-core cdylib | jlong ptr | connector-core JVM | | - WideningTableProvider | <------------- | - DatafusionSource (DSv2) | | over arrow::cast | | - SparkPredicateTranslator | -+--------------------------+ | - ColumnarPartitionReader | - +------------------------------+ - | - v - +------------------------------+ - | datafusion-java | - | - SessionContext | - | - registerFfiTable(name,ptr)| - | - DataFrame.filterFromProto | - +------------------------------+ +| - createScan: session, | FFI_Arrow- | - ColumnarPartitionReader | +| projection, filters, | ArrayStream | - SharedScanCache | +| plan, exec partitions | -------------> | | ++--------------------------+ +------------------------------+ ``` Key invariants: -- Only the opaque `FFI_TableProvider` pointer crosses the cdylib boundary. - No `SessionContext` is ever shared. -- The widening cdylib (connector-core) sits between your bridge and - `registerFfiTable`. It casts Spark-incompatible Arrow types (UInt*, Float16, +- Only the opaque `FFI_TableProvider` pointer crosses the cdylib boundary + (and `FFI_ArrowArrayStream` on the way back). No `SessionContext` is ever + shared, and none exists JVM-side — planning and execution live entirely in + the connector cdylib. +- The connector cdylib widens between your bridge's provider and the scan: + it casts Spark-incompatible Arrow types (UInt*, Float16, Time*, non-µs Timestamp, recursive List/LargeList/FixedSizeList) using kernel-level `arrow::compute::cast`. No SQL, no view rewrites. 
- Predicate pushdown crosses the FFI boundary as a `LogicalExprNode` proto @@ -177,12 +173,12 @@ df = (spark.read.format("my_format") | Phase | Where | Path | | --------------------------- | --------- | ---- | -| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → widen → `registerFfiTable` → `ctx.tableSchema` | +| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → `FfiHelperNative.providerSchemaIpc` (widens, returns Arrow IPC schema) | | `ScanBuilder.build` | Driver | `factory.listPartitions(optionsBytes, filterBytes)` (filter-aware overload — bridges can prune partitions; cached on Scan) + `factory.reportPartitioning(optionsBytes)` (cached on Scan) | | `outputPartitioning` | Driver | `KeyGroupedPartitioning(reported.keys, partitions.length)` when bridge declared one; `UnknownPartitioning(partitions.length)` otherwise. Spark may elide shuffles when keys line up with downstream join/agg grouping. | | `planInputPartitions` | Driver | Reuses the cached `PartitionInfo[]`; one task per entry with that entry's `partitionBytes` + `preferredLocations` | | Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | -| Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → widen → `registerFfiTable` → `ctx.sql("SELECT proj FROM t")` → fold `DataFrame.filterFromProto(bytes)` over pushed predicates → `executeStream` | +| Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → `FfiHelperNative.createScan` (widen, projection, pushed proto filters, plan) → `executeStream` | ## Partition key values (`HasPartitionKey`) @@ -229,7 +225,7 @@ What changes: | ---------------------- | -------- | ---- | | `ScanBuilder.build` | Driver | mint `scanId` (UUID) + pin session config → probe build (same code path as executors) → physical plan partition count `N` → `N` 
tasks | | `outputPartitioning` | Driver | always `UnknownPartitioning(N)` — DataFusion partitions carry no key contract; `listPartitions` / `reportPartitioning` are not called | -| Per-task scan | Executor | `SharedScanCache.acquire(scanId)` → (first task only) `createProvider(opts, EMPTY)` → widen → `registerFfiTable` on a pinned-config `SessionContext` → SQL + filters → plan once → every task `executeStream(partitionIndex)` → release | +| Per-task scan | Executor | `SharedScanCache.acquire(scanId)` → (first task only) `createProvider(opts, EMPTY)` → `FfiHelperNative.createScan` with the pinned config (widen, projection, filters, plan once) → every task `executeStreamPartition(partitionIndex)` → release | Cache semantics: entries are keyed by `scanId` (per query — separate actions build separate entries), refcounted by open readers, and evicted after an idle diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index e75a37d..8b861bb 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -17,12 +17,13 @@ //! Example cdylib that produces a small DataFusion `MemTable` wrapped as an //! `FFI_TableProvider`, returned to the JVM as a `jlong` (the raw boxed -//! pointer). The JVM example uses `SessionContext.registerFfiTable(name, ptr)` -//! to install the provider on a DataFusion session and runs SQL against it. +//! pointer). The Spark connector consumes the pointer via +//! `FfiHelperNative.createScan` / `providerSchemaIpc`, which widen the +//! provider and plan/execute the scan inside the connector cdylib. //! //! The same pattern is what domain bridges (Rerun, HDF5, custom Iceberg) use -//! to expose their TableProviders to DataFusion-Java — and, transitively, to -//! Spark via the connector-core DataSource V2 plumbing. +//! to expose their TableProviders to Spark via the connector-core DataSource +//! V2 plumbing. //! //! ## Options wire format //! 
@@ -186,8 +187,8 @@ fn build_mem_table( /// JNI entry point: decode the options blob, build a `MemTable` accordingly, /// wrap it in an `FFI_TableProvider`, return the raw boxed pointer as a `jlong`. /// Ownership of the boxed FFI transfers to the caller — the matching -/// `Box::from_raw` is performed by `SessionContext.registerFfiTable` on the -/// consumer side. +/// `Box::from_raw` is performed by the consumer (the Spark connector's +/// `FfiHelperNative.createScan` / `providerSchemaIpc`). #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider< 'local, @@ -231,9 +232,9 @@ pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExamp } /// Drop a previously-created FFI_TableProvider whose pointer was NOT handed -/// off to `registerFfiTable`. Exposed for symmetry — callers that pass the -/// pointer to `registerFfiTable` must NOT also call this; ownership has -/// already transferred. +/// off to a consumer. Exposed for the error path — callers that pass the +/// pointer to `createScan` / `providerSchemaIpc` must NOT also call this; +/// ownership has already transferred. #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_dropProvider< 'local, diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index b1b90c2..561544f 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -73,8 +73,8 @@ *

    In the default mode a single partition (id {@code "p0"}, empty {@code partitionBytes}, no * preferred host) is reported so Spark spawns one task; the executor calls {@link * #createProvider(byte[], byte[])} to obtain a fresh {@code FFI_TableProvider} pointer, hands it to - * {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)}, and streams the - * resulting Arrow record batches back into the Spark scan. + * {@code FfiHelperNative.createScan}, and streams the resulting Arrow record batches back into the + * Spark scan. */ public final class ExampleFfiProviderFactory implements FfiProviderFactory { diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java deleted file mode 100644 index baa5dae..0000000 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExample.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datafusion.examples; - -import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.ipc.ArrowReader; -import org.apache.datafusion.DataFrame; -import org.apache.datafusion.SessionContext; - -/** - * Register a Rust-built {@code FFI_TableProvider} on a {@link SessionContext} and run SQL against - * it. - * - *

    The provider here wraps a tiny in-memory table (4 rows, 3 columns) built by the example cdylib - * under {@code examples/native}. The same {@link SessionContext#registerFfiTable(String, long)} - * entry point is what domain bridges (Rerun, HDF5, custom Iceberg) use to expose their native - * {@code TableProvider}s — and, transitively, what the Spark connector uses through the {@code - * FfiProviderFactory} interface in {@code connector-core} (see {@code - * examples/SPARK_INTEGRATION.md}). - * - *

    How to run (from the fork repo root): - * - *

    {@code
    - * cargo build -p datafusion-java-ffi-example --release
    - * mvn -B install -DskipTests -Drat.skip=true \
    - *     -Ddatafusion.native.profile=release
    - * mvn -B -pl examples exec:exec \
    - *     -Dexec.mainClass=org.apache.datafusion.examples.FfiTableProviderExample
    - * }
    - * - *

    The first {@code mvn install} step publishes {@code datafusion-java} to your local Maven repo - * so the separate {@code exec:exec} invocation can resolve it as a dependency. Skipping straight to - * {@code exec:exec} after a {@code package} build fails with {@code Could not find artifact - * org.apache.datafusion:datafusion-java:...}. - */ -public final class FfiTableProviderExample { - - private FfiTableProviderExample() {} - - public static void main(String[] args) throws Exception { - // Build the FFI provider on the Rust side. The returned `long` is a - // `Box::into_raw(Box::new(FFI_TableProvider))` pointer; ownership flows - // through `registerFfiTable` into the SessionContext. Empty options bytes - // pick the native defaults (name_prefix="row", num_rows=4, num_batches=1). - long ffiProviderPtr = FfiTableProviderExampleNative.createMemTableProvider(new byte[0]); - if (ffiProviderPtr == 0) { - throw new IllegalStateException("Native FFI provider builder returned 0"); - } - - try (var allocator = new RootAllocator(); - var ctx = new SessionContext()) { - - // Hand the raw pointer to DataFusion. After this call, the SessionContext - // owns the boxed FFI_TableProvider; do NOT call dropProvider afterwards. - ctx.registerFfiTable("example_mem", ffiProviderPtr); - - // Filter pushdown crosses the FFI boundary transparently — DataFusion's - // optimizer rewrites the predicate into a TableProviderFilterPushDown - // call on the foreign provider, which a MemTable handles unsupported - // (the executor re-applies it above the scan). 
- try (DataFrame df = - ctx.sql("SELECT id, name, value FROM example_mem WHERE id > 1 ORDER BY id"); - ArrowReader reader = df.collect(allocator)) { - System.out.println("Result rows:"); - while (reader.loadNextBatch()) { - VectorSchemaRoot batch = reader.getVectorSchemaRoot(); - System.out.print(batch.contentToTSVString()); - } - } - } - } -} diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java index 612fd8d..dc0cdda 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java @@ -26,9 +26,8 @@ /** * JNI bindings into the example cdylib at {@code examples/native}. The cdylib produces a small - * {@code MemTable}-backed {@code FFI_TableProvider} that the JVM example registers on a {@link - * org.apache.datafusion.SessionContext} via {@link - * org.apache.datafusion.SessionContext#registerFfiTable(String, long)}. + * {@code MemTable}-backed {@code FFI_TableProvider} that {@link ExampleFfiProviderFactory} hands to + * the Spark connector ({@code FfiHelperNative.createScan}). * *

    The library is located in this order: * @@ -56,7 +55,7 @@ private FfiTableProviderExampleNative() {} /** * Build a {@code MemTable} on the Rust side, wrap it in an {@code FFI_TableProvider}, and return * the raw boxed pointer as a {@code long}. Ownership transfers to the caller; passing the pointer - * to {@link org.apache.datafusion.SessionContext#registerFfiTable(String, long)} discharges it. + * to a consumer such as {@code FfiHelperNative.createScan} discharges it. * *

    {@code optionsBytes} is the length-prefixed binary blob produced by {@link * ExampleFfiProviderFactory#encodeOptions(java.util.Map)}. An empty or {@code null} array decodes @@ -65,9 +64,9 @@ private FfiTableProviderExampleNative() {} static native long createMemTableProvider(byte[] optionsBytes); /** - * Drop an FFI_TableProvider pointer that was NEVER handed to {@code - * SessionContext.registerFfiTable}. Call this only on the error path before registration; once - * {@code registerFfiTable} accepts the pointer it owns the box. + * Drop an FFI_TableProvider pointer that was NEVER handed to a consumer. Call this only on the + * error path before handover; once {@code FfiHelperNative.createScan} (or {@code + * providerSchemaIpc}) accepts the pointer it owns the box. */ static native void dropProvider(long ffiTableProviderPtr); diff --git a/native-common/Cargo.toml b/native-common/Cargo.toml new file mode 100644 index 0000000..0a797b4 --- /dev/null +++ b/native-common/Cargo.toml @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-jni-common" +version = "0.1.0" +edition = "2021" +publish = false + +[features] +# `datafusion-jni` builds DataFusion with `avro`, which adds the +# `DataFusionError::AvroError` variant our classifier maps to IoException. +# Feature-forwarded so consumers that don't read Avro (the Spark helper) +# don't pull the apache-avro stack into their cdylib. +avro = ["datafusion/avro"] + +[dependencies] +datafusion = { workspace = true } +futures = { workspace = true } +jni = { workspace = true } +tokio = { workspace = true } diff --git a/native/src/errors.rs b/native-common/src/errors.rs similarity index 97% rename from native/src/errors.rs rename to native-common/src/errors.rs index d926544..caa2540 100644 --- a/native/src/errors.rs +++ b/native-common/src/errors.rs @@ -96,8 +96,11 @@ fn classify(err: &DataFusionError) -> &'static str { } DataFusionError::IoError(_) | DataFusionError::ObjectStore(_) - | DataFusionError::ParquetError(_) - | DataFusionError::AvroError(_) => "org/apache/datafusion/IoException", + | DataFusionError::ParquetError(_) => "org/apache/datafusion/IoException", + // The AvroError variant only exists when DataFusion is built with its + // `avro` feature, forwarded by this crate's own `avro` feature. + #[cfg(feature = "avro")] + DataFusionError::AvroError(_) => "org/apache/datafusion/IoException", // ArrowError is a 21-variant grab bag -- only some of those variants // are actually IO-shaped. DivideByZero / ArithmeticOverflow / Compute // / Cast / InvalidArgument / Memory etc. are execution-time failures diff --git a/native-common/src/lib.rs b/native-common/src/lib.rs new file mode 100644 index 0000000..01227fd --- /dev/null +++ b/native-common/src/lib.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! JNI plumbing shared by this workspace's cdylibs (`datafusion-jni` and the +//! Spark connector helper): the error-to-Java-exception mapping, the +//! per-cdylib Tokio runtime singleton, and the async-stream-to- +//! `FFI_ArrowArrayStream` bridge. +//! +//! Each cdylib statically links its own copy of this rlib, so [`runtime`] is +//! a per-cdylib singleton -- exactly the behaviour each crate had when this +//! code lived inline. Nothing here is exported with `#[no_mangle]`, so +//! linking this crate into several cdylibs loaded in one JVM cannot collide. + +pub mod errors; + +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::sync::OnceLock; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::error::ArrowError; +use datafusion::arrow::record_batch::RecordBatchReader; +use datafusion::execution::SendableRecordBatchStream; +use futures::StreamExt; +use tokio::runtime::{Handle, Runtime}; + +static RT: OnceLock = OnceLock::new(); + +/// The cdylib-wide Tokio runtime. +pub fn runtime() -> &'static Runtime { + runtime_with_init(|_| {}) +} + +/// Same singleton as [`runtime`], with a hook that runs exactly once, when +/// the runtime is created. 
`datafusion-jni` uses it to install its +/// runtime-metrics accumulator so the sampling baseline coincides with +/// runtime start; every later call (either entry point) returns the existing +/// runtime without invoking the hook. +pub fn runtime_with_init(init: impl FnOnce(&Handle)) -> &'static Runtime { + RT.get_or_init(|| { + let rt = Runtime::new().expect("failed to create Tokio runtime"); + init(rt.handle()); + rt + }) +} + +/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous +/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore +/// the Java `ArrowReader`) consumes. Each call to `next()` drives one +/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the +/// executor pipeline plus a single in-flight batch. +pub struct StreamingReader { + pub schema: SchemaRef, + pub stream: SendableRecordBatchStream, +} + +impl Iterator for StreamingReader { + type Item = Result; + + fn next(&mut self) -> Option { + // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's + // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic + // here (buggy UDF, arrow cast that panics, runtime poison) would + // unwind across C/FFI -- undefined behaviour. Catch it and surface as + // an ArrowError so the Java side sees a normal exception instead. 
+ let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next()))); + match next { + Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))), + Err(panic) => { + let msg = if let Some(s) = panic.downcast_ref::() { + s.clone() + } else if let Some(s) = panic.downcast_ref::<&str>() { + (*s).to_string() + } else { + "rust panic with non-string payload".to_string() + }; + Some(Err(ArrowError::ExternalError( + format!("panic in DataFrame stream: {msg}").into(), + ))) + } + } + } +} + +impl RecordBatchReader for StreamingReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/native/Cargo.toml b/native/Cargo.toml index aa56ca6..0f4ca83 100644 --- a/native/Cargo.toml +++ b/native/Cargo.toml @@ -23,8 +23,8 @@ publish = false [lib] # `rlib` alongside `cdylib` so `cargo test` has a Rust-level harness for -# native-only invariants (e.g. error-classification routing through wrapped -# DataFusionError chains). The `cdylib` is still the artifact the JVM loads. +# native-only invariants (the error-classification tests now live in +# `datafusion-jni-common`). The `cdylib` is still the artifact the JVM loads. crate-type = ["cdylib", "rlib"] [features] @@ -72,7 +72,10 @@ runtime-metrics = ["dep:tokio-metrics"] arrow = { workspace = true } async-trait = { workspace = true } datafusion = { workspace = true, features = ["avro"] } -datafusion-ffi = { workspace = true } +# Shared JNI plumbing (error->exception mapping, runtime singleton, +# StreamingReader). `avro` keeps the classifier's AvroError->IoException arm +# in sync with the `avro` feature on `datafusion` above. 
+datafusion-jni-common = { path = "../native-common", features = ["avro"] } datafusion-proto = { workspace = true } datafusion-substrait = { workspace = true, optional = true } futures = { workspace = true } diff --git a/native/src/arrow.rs b/native/src/arrow.rs index 2bbe7b0..67e5caf 100644 --- a/native/src/arrow.rs +++ b/native/src/arrow.rs @@ -23,10 +23,10 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::ArrowReadOptionsProto; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_arrow_options( env: &mut JNIEnv, diff --git a/native/src/avro.rs b/native/src/avro.rs index 85d4a07..257ae32 100644 --- a/native/src/avro.rs +++ b/native/src/avro.rs @@ -23,10 +23,10 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::AvroReadOptionsProto; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_avro_options( env: &mut JNIEnv, diff --git a/native/src/cache_manager.rs b/native/src/cache_manager.rs index 3b9e286..ec38dc8 100644 --- a/native/src/cache_manager.rs +++ b/native/src/cache_manager.rs @@ -34,8 +34,8 @@ use datafusion::execution::cache::cache_unit::{ }; use datafusion::execution::cache::DefaultListFilesCache; -use crate::errors::JniResult; use crate::proto_gen::CacheManagerOptionsProto; +use datafusion_jni_common::errors::JniResult; /// Build a [`CacheManagerConfig`] from the proto. 
Returns `Ok(None)` if the /// caller did not set any cache-manager field, so the JNI layer can skip the diff --git a/native/src/csv.rs b/native/src/csv.rs index 3ae4627..b79ed59 100644 --- a/native/src/csv.rs +++ b/native/src/csv.rs @@ -26,12 +26,12 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::{ CsvReadOptionsProto, CsvWriteOptionsProto, FileCompressionType as ProtoFileCompressionType, }; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_csv_options( env: &mut JNIEnv, diff --git a/native/src/ffi_table_provider.rs b/native/src/ffi_table_provider.rs deleted file mode 100644 index f055263..0000000 --- a/native/src/ffi_table_provider.rs +++ /dev/null @@ -1,71 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Generic FFI-bridged TableProvider registration. -//! -//! Accepts a raw `FFI_TableProvider` pointer produced elsewhere — by another -//! cdylib (cross-binary boundary; transparently wrapped via -//! `ForeignTableProvider`) or by Rust code in this same crate (same-binary; -//! 
library marker lets the impl unwrap to the original Arc). -//! -//! Ownership: the caller's `Box::into_raw(Box::new(FFI_TableProvider))` -//! pointer is consumed here. After this call the pointer must not be reused. - -use std::sync::Arc; - -use datafusion::catalog::TableProvider; -use datafusion::prelude::SessionContext; -use datafusion_ffi::table_provider::FFI_TableProvider; -use jni::objects::{JClass, JString}; -use jni::sys::jlong; -use jni::JNIEnv; - -use crate::errors::{try_unwrap_or_throw, JniResult}; - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_SessionContext_registerFfiTableNative<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - name: JString<'local>, - ffi_ptr: jlong, -) { - try_unwrap_or_throw(&mut env, (), |env| -> JniResult<()> { - if handle == 0 { - return Err("SessionContext handle is null".into()); - } - if ffi_ptr == 0 { - return Err("registerFfiTable: FFI_TableProvider pointer is null".into()); - } - // SAFETY: matches the existing `registerTableNative` pattern — handle - // came from `createSessionContext` as `Box` raw ptr. - let ctx = unsafe { &*(handle as *const SessionContext) }; - let name: String = env.get_string(&name)?.into(); - - // Take ownership of the producer's FFI_TableProvider, materialise an - // Arc on this side (cross-cdylib hop returns a - // ForeignTableProvider wrapper; same-cdylib hop returns the original - // Arc thanks to LIBRARY_MARKER dispatch in datafusion-ffi), then drop - // the Box — the Arc clone now retains ownership. 
- let ffi = unsafe { Box::from_raw(ffi_ptr as *mut FFI_TableProvider) }; - let provider: Arc = (&*ffi).into(); - drop(ffi); - - ctx.register_table(name.as_str(), provider)?; - Ok(()) - }) -} diff --git a/native/src/json.rs b/native/src/json.rs index 8eea32f..b87be78 100644 --- a/native/src/json.rs +++ b/native/src/json.rs @@ -27,12 +27,12 @@ use jni::sys::jlong; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::proto_gen::{ FileCompressionType as ProtoFileCompressionType, JsonWriteOptionsProto, NdJsonReadOptionsProto, }; use crate::runtime; use crate::schema::decode_optional_schema; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; fn with_json_options( env: &mut JNIEnv, diff --git a/native/src/lib.rs b/native/src/lib.rs index bab6477..6fb1eb1 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -19,13 +19,10 @@ mod arrow; mod avro; mod cache_manager; mod csv; -mod errors; -mod ffi_table_provider; mod jni_util; mod json; mod memory; mod object_store; -mod partitioned_execution; mod proto; mod runtime_metrics; mod schema; @@ -36,16 +33,13 @@ pub(crate) mod proto_gen { include!(concat!(env!("OUT_DIR"), "/datafusion_java.rs")); } -use std::panic::{catch_unwind, AssertUnwindSafe}; use std::path::PathBuf; use std::sync::{Arc, OnceLock}; -use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::error::ArrowError; use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::arrow::ipc::writer::StreamWriter; -use datafusion::arrow::record_batch::{RecordBatchIterator, RecordBatchReader}; +use datafusion::arrow::record_batch::RecordBatchIterator; use datafusion::common::{JoinType, UnnestOptions}; use datafusion::config::TableParquetOptions; use datafusion::dataframe::DataFrame; @@ -53,11 +47,9 @@ use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::DataFusionError; use 
datafusion::execution::disk_manager::{DiskManagerBuilder, DiskManagerMode}; use datafusion::execution::runtime_env::RuntimeEnvBuilder; -use datafusion::execution::SendableRecordBatchStream; use datafusion::logical_expr::Expr; use datafusion::logical_expr::{col, Partitioning, ScalarUDF, Signature, SortExpr}; use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext}; -use futures::StreamExt; use jni::objects::{JBooleanArray, JByteArray, JClass, JObject, JObjectArray, JString}; use jni::sys::{jboolean, jbyte, jbyteArray, jint, jlong}; use jni::JNIEnv; @@ -65,7 +57,10 @@ use jni::JavaVM; use prost::Message; use tokio::runtime::Runtime; -use crate::errors::{try_unwrap_or_throw, JniResult}; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; +// Re-exported so sibling modules keep their crate-local `crate::StreamingReader` path. +pub(crate) use datafusion_jni_common::StreamingReader; + use crate::proto_gen::ParquetReadOptionsProto; use crate::proto_gen::SessionOptions; use crate::schema::decode_optional_schema; @@ -86,18 +81,15 @@ pub(crate) fn jvm() -> &'static JavaVM { } pub(crate) fn runtime() -> &'static Runtime { - static RT: OnceLock = OnceLock::new(); - RT.get_or_init(|| { - let rt = Runtime::new().expect("failed to create Tokio runtime"); - // Eagerly install the runtime-metrics accumulator (no-op when the - // `runtime-metrics` Cargo feature is off). Initialising here -- not - // lazily on the first `runtimeStats()` call -- means the - // RuntimeMonitor's sampling baseline coincides with runtime start, so - // poll/park/busy totals reflect activity from the first query onward - // rather than from the first observation. - crate::runtime_metrics::init(rt.handle()); - rt - }) + // The singleton itself lives in datafusion-jni-common (shared with the + // Spark helper cdylib; each cdylib statically links its own copy, so the + // runtime stays per-library). 
The init hook eagerly installs the + // runtime-metrics accumulator (no-op when the `runtime-metrics` Cargo + // feature is off). Initialising here -- not lazily on the first + // `runtimeStats()` call -- means the RuntimeMonitor's sampling baseline + // coincides with runtime start, so poll/park/busy totals reflect activity + // from the first query onward rather than from the first observation. + datafusion_jni_common::runtime_with_init(crate::runtime_metrics::init) } /// Wrap the (already-built) `RuntimeEnvBuilder`'s memory pool with a @@ -291,50 +283,6 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_collectDataFrame<'lo }) } -/// Bridges DataFusion's async [`SendableRecordBatchStream`] to the synchronous -/// [`RecordBatchReader`] interface that `FFI_ArrowArrayStream` (and therefore -/// the Java `ArrowReader`) consumes. Each call to `next()` drives one -/// `runtime().block_on(stream.next())`, so memory pressure stays bounded by the -/// executor pipeline plus a single in-flight batch. -pub(crate) struct StreamingReader { - pub(crate) schema: SchemaRef, - pub(crate) stream: SendableRecordBatchStream, -} - -impl Iterator for StreamingReader { - type Item = Result; - - fn next(&mut self) -> Option { - // Arrow's C ABI invokes this iterator through FFI_ArrowArrayStream's - // vtable, outside the JNI handler's try_unwrap_or_throw guard. A panic - // here (buggy UDF, arrow cast that panics, runtime poison) would - // unwind across C/FFI -- undefined behaviour. Catch it and surface as - // an ArrowError so the Java side sees a normal exception instead. 
- let next = catch_unwind(AssertUnwindSafe(|| runtime().block_on(self.stream.next()))); - match next { - Ok(item) => item.map(|r| r.map_err(|e| ArrowError::ExternalError(Box::new(e)))), - Err(panic) => { - let msg = if let Some(s) = panic.downcast_ref::() { - s.clone() - } else if let Some(s) = panic.downcast_ref::<&str>() { - (*s).to_string() - } else { - "rust panic with non-string payload".to_string() - }; - Some(Err(ArrowError::ExternalError( - format!("panic in DataFrame stream: {msg}").into(), - ))) - } - } - } -} - -impl RecordBatchReader for StreamingReader { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_DataFrame_executeStreamDataFrame<'local>( mut env: JNIEnv<'local>, @@ -537,35 +485,6 @@ pub extern "system" fn Java_org_apache_datafusion_DataFrame_filterRows<'local>( }) } -/// Decode a DataFusion-proto `LogicalExprNode` and apply it as a `Filter` to this DataFrame. -/// Used by the Spark connector to push V2 `Predicate`s as DataFusion `Expr` bytes (translated -/// JVM-side by `SparkPredicateTranslator`). 
-#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_DataFrame_filterFromProto<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - expr_proto_bytes: JByteArray<'local>, -) -> jlong { - use datafusion_proto::logical_plan::from_proto::parse_expr; - use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; - use datafusion_proto::protobuf::LogicalExprNode; - - try_unwrap_or_throw(&mut env, 0, |env| -> JniResult { - if handle == 0 { - return Err("DataFrame handle is null".into()); - } - let df = unsafe { &*(handle as *const DataFrame) }.clone(); - let bytes: Vec = env.convert_byte_array(&expr_proto_bytes)?; - let node = LogicalExprNode::decode(bytes.as_slice())?; - let task_ctx = df.task_ctx(); - let extension_codec = DefaultLogicalExtensionCodec {}; - let expr = parse_expr(&node, &task_ctx, &extension_codec)?; - let new_df = df.filter(expr)?; - Ok(Box::into_raw(Box::new(new_df)) as jlong) - }) -} - #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_DataFrame_limitRows<'local>( mut env: JNIEnv<'local>, diff --git a/native/src/object_store.rs b/native/src/object_store.rs index eefccf2..985d721 100644 --- a/native/src/object_store.rs +++ b/native/src/object_store.rs @@ -28,9 +28,9 @@ use std::sync::Arc; use datafusion::prelude::SessionContext; use url::Url; -use crate::errors::JniResult; use crate::proto_gen::object_store_registration::Backend; use crate::proto_gen::ObjectStoreRegistration; +use datafusion_jni_common::errors::JniResult; #[cfg(feature = "object-store-gcp")] use crate::proto_gen::GcsOptions; diff --git a/native/src/partitioned_execution.rs b/native/src/partitioned_execution.rs deleted file mode 100644 index 8ac3909..0000000 --- a/native/src/partitioned_execution.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Per-partition execution of a planned DataFrame. -//! -//! `Java_org_apache_datafusion_DataFrame_createPartitionedExecution` plans a -//! DataFrame exactly once and returns a handle over the resulting physical -//! plan. The handle supports concurrent `executeStreamPartition` calls from -//! multiple JVM threads -- `ExecutionPlan` and `TaskContext` are `Send + Sync` -//! and every call only clones their `Arc`s before producing an independent -//! `SendableRecordBatchStream`. Re-executing the same partition index twice -//! (Spark task retry / speculative execution) opens its own stream, but only -//! succeeds when every operator in that partition's pipeline supports repeated -//! `execute()` -- stateless scans (MemTable, table providers) do, while -//! `RepartitionExec` pipelines panic on the second call because their -//! per-partition channel receivers are single-use. -//! -//! The single unsafe interleaving is `closePartitionedExecution` racing an -//! in-flight call on the same handle. The Java consumer (the Spark connector's -//! shared-scan cache) prevents it with a refcount that covers every open -//! reader; `PartitionedExecution`'s Javadoc states the contract for any other -//! caller. 
- -use std::sync::Arc; - -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream; -use datafusion::dataframe::DataFrame; -use datafusion::execution::TaskContext; -use datafusion::physical_plan::ExecutionPlan; -use jni::objects::JClass; -use jni::sys::{jint, jlong}; -use jni::JNIEnv; - -use crate::errors::{try_unwrap_or_throw, JniResult}; -use crate::{runtime, StreamingReader}; - -pub(crate) struct PartitionedExecutionState { - plan: Arc, - task_ctx: Arc, -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_DataFrame_createPartitionedExecution<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) -> jlong { - try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult { - if handle == 0 { - return Err("DataFrame handle is null".into()); - } - // Consuming, like executeStreamDataFrame: the Java side zeroes its - // handle before calling, so this Box is the last owner. - let df = unsafe { *Box::from_raw(handle as *mut DataFrame) }; - - // task_ctx() borrows; capture it before create_physical_plan consumes - // the DataFrame. 
- let task_ctx = Arc::new(df.task_ctx()); - let plan = runtime().block_on(df.create_physical_plan())?; - - let state = PartitionedExecutionState { plan, task_ctx }; - Ok(Box::into_raw(Box::new(state)) as jlong) - }) -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_partitionCountNative< - 'local, ->( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) -> jint { - try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult { - if handle == 0 { - return Err("PartitionedExecution handle is null".into()); - } - let state = unsafe { &*(handle as *const PartitionedExecutionState) }; - Ok(state - .plan - .properties() - .output_partitioning() - .partition_count() as jint) - }) -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_executeStreamPartition< - 'local, ->( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - partition: jint, - ffi_stream_addr: jlong, -) { - try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { - if handle == 0 { - return Err("PartitionedExecution handle is null".into()); - } - if ffi_stream_addr == 0 { - return Err("ffi stream address is null".into()); - } - let state = unsafe { &*(handle as *const PartitionedExecutionState) }; - - let partition_count = state - .plan - .properties() - .output_partitioning() - .partition_count(); - if partition < 0 || partition as usize >= partition_count { - return Err(format!( - "partition index {partition} out of range: plan has {partition_count} partition(s)" - ) - .into()); - } - - let plan = Arc::clone(&state.plan); - let task_ctx = Arc::clone(&state.task_ctx); - let schema: SchemaRef = plan.schema(); - - // ExecutionPlan::execute is synchronous, but operators may - // tokio::spawn at execute() time (RepartitionExec et al.), which - // requires a runtime context to be entered. - let stream = { - let _guard = runtime().enter(); - plan.execute(partition as usize, task_ctx)? 
- }; - - let reader = StreamingReader { schema, stream }; - let ffi = FFI_ArrowArrayStream::new(Box::new(reader)); - unsafe { - std::ptr::write(ffi_stream_addr as *mut FFI_ArrowArrayStream, ffi); - } - Ok(()) - }) -} - -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_PartitionedExecution_closePartitionedExecution< - 'local, ->( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) { - try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { - if handle == 0 { - return Err("PartitionedExecution handle is null".into()); - } - drop(unsafe { Box::from_raw(handle as *mut PartitionedExecutionState) }); - Ok(()) - }) -} diff --git a/native/src/proto.rs b/native/src/proto.rs index 4f187bc..c1315f9 100644 --- a/native/src/proto.rs +++ b/native/src/proto.rs @@ -28,8 +28,8 @@ use jni::sys::{jbyteArray, jlong}; use jni::JNIEnv; use prost::Message; -use crate::errors::{try_unwrap_or_throw, JniResult}; use crate::runtime; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; #[no_mangle] pub extern "system" fn Java_org_apache_datafusion_SessionContext_createDataFrameFromProto< diff --git a/native/src/runtime_metrics.rs b/native/src/runtime_metrics.rs index ddd8698..dd60dcb 100644 --- a/native/src/runtime_metrics.rs +++ b/native/src/runtime_metrics.rs @@ -38,7 +38,7 @@ //! 10 totalOverflowCount #[cfg(not(feature = "runtime-metrics"))] -use crate::errors::JniResult; +use datafusion_jni_common::errors::JniResult; /// Number of i64 values in the snapshot array; kept here so the Java side and /// the feature-off stub agree on the layout. 
@@ -51,7 +51,7 @@ mod imp { use tokio_metrics::{RuntimeIntervals, RuntimeMonitor}; use super::STATS_FIELD_COUNT; - use crate::errors::JniResult; + use datafusion_jni_common::errors::JniResult; /// `RuntimeMonitor::intervals().next()` returns *delta* metrics covering /// the period since the previous call (or, on the very first call, since diff --git a/native/src/schema.rs b/native/src/schema.rs index 968a73a..0c3c7ab 100644 --- a/native/src/schema.rs +++ b/native/src/schema.rs @@ -20,7 +20,7 @@ use datafusion::arrow::ipc::reader::StreamReader; use jni::objects::JByteArray; use jni::JNIEnv; -use crate::errors::JniResult; +use datafusion_jni_common::errors::JniResult; /// Decode an optional Arrow-IPC schema byte array passed in from Java. /// Returns `None` if the byte-array reference is null. diff --git a/spark/native/Cargo.toml b/spark/native/Cargo.toml index 5f42bf2..bd9d423 100644 --- a/spark/native/Cargo.toml +++ b/spark/native/Cargo.toml @@ -24,6 +24,12 @@ arrow = { workspace = true } async-trait = { workspace = true } datafusion = { workspace = true } datafusion-ffi = { workspace = true } +# Shared JNI plumbing: error->Java-exception mapping and the per-cdylib Tokio +# runtime singleton. The thrown classes (org.apache.datafusion.*) come from +# the datafusion-java core jar, which the Spark module already depends on. +datafusion-jni-common = { path = "../../native-common" } +datafusion-proto = { workspace = true } futures = { workspace = true } jni = { workspace = true } +prost = { workspace = true } tokio = { workspace = true } diff --git a/spark/native/src/lib.rs b/spark/native/src/lib.rs index 8c50a00..038d3f9 100644 --- a/spark/native/src/lib.rs +++ b/spark/native/src/lib.rs @@ -15,114 +15,23 @@ // specific language governing permissions and limitations // under the License. -//! Widening cdylib for the generic Spark connector. +//! Native side of the generic Spark connector. //! -//! Single JNI entry point: `wrapWithWidening(jlong) -> jlong`. 
Takes a raw -//! `FFI_TableProvider` pointer produced by a bridge cdylib, wraps the inner -//! `TableProvider` in a [`WideningTableProvider`] that exposes +//! Takes raw `FFI_TableProvider` pointers produced by a bridge cdylib and +//! does everything DataFusion-side in process: schema probe, widening to //! Spark-compatible Arrow types (UInt*→signed wider, Float16→Float32, -//! Time*→Int wider, Timestamp(*, tz)→Timestamp(Microsecond, tz)), and -//! re-FFIs the result for the consumer (datafusion-java's cdylib). -//! -//! No SessionContext or SQL — kernel-level `arrow::compute::cast` only. - -use std::error::Error; -use std::panic::{catch_unwind, AssertUnwindSafe}; -use std::sync::{Arc, OnceLock}; +//! Time*→Int wider, Timestamp(*, tz)→Timestamp(Microsecond, tz)), session +//! construction from the driver-pinned config, projection + proto-filter +//! application, planning, and per-partition stream execution. See [`scan`] +//! for the JNI surface and [`widening`] for the cast layer. -use datafusion::catalog::TableProvider; -use datafusion::execution::TaskContextProvider; -use datafusion::prelude::SessionContext; -use datafusion_ffi::execution::FFI_TaskContextProvider; -use datafusion_ffi::table_provider::FFI_TableProvider; -use jni::objects::JClass; -use jni::sys::jlong; -use jni::JNIEnv; -use tokio::runtime::{Handle, Runtime}; +use tokio::runtime::Handle; +pub mod scan; pub mod widening; -use widening::WideningTableProvider; - -type JniResult = Result>; - -/// Shared Tokio runtime. The widening cdylib does not itself await any IO, -/// but the FFI_TableProvider it produces is registered on a foreign -/// SessionContext that may schedule work via this handle. +/// Shared Tokio runtime (the per-cdylib singleton from +/// `datafusion-jni-common`). Planning and stream execution all run on it. 
fn runtime() -> &'static Handle { - static RUNTIME: OnceLock = OnceLock::new(); - RUNTIME - .get_or_init(|| Runtime::new().expect("tokio runtime init failed")) - .handle() -} - -/// Shared "host" SessionContext within the widening cdylib. Only used as -/// the source of a `TaskContextProvider` passed into `FFI_TableProvider::new`. -/// Lives for the lifetime of the cdylib; no datasets are ever registered on it. -fn host_session_context() -> &'static Arc { - static CTX: OnceLock> = OnceLock::new(); - CTX.get_or_init(|| Arc::new(SessionContext::new())) -} - -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_wrapWithWidening<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_raw_ptr: jlong, -) -> jlong { - try_unwrap_or_throw(&mut env, 0, |_env| { - if ffi_raw_ptr == 0 { - return Err("wrapWithWidening: input FFI_TableProvider pointer is null".into()); - } - // Take ownership of the producer's FFI_TableProvider. - let ffi_raw: Box = - unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; - - // Cross-cdylib hop: `Arc::::from(&FFI_TableProvider)` - // returns a `ForeignTableProvider` wrapper that delegates back through - // the producer's vtable. Drop our `Box` immediately afterward — the - // ForeignTableProvider clone owns its own retained copy. - let inner: Arc = (&*ffi_raw).into(); - drop(ffi_raw); - - let widened: Arc = Arc::new(WideningTableProvider::new(inner)); - - // Re-wrap as an FFI_TableProvider for the consumer. - let ctx_provider: Arc = - Arc::clone(host_session_context()) as Arc; - let ffi_task_ctx = FFI_TaskContextProvider::from(&ctx_provider); - let ffi = FFI_TableProvider::new( - widened, - /*can_support_pushdown_filters=*/ true, - Some(runtime().clone()), - ffi_task_ctx, - /*logical_codec=*/ None, - ); - Ok(Box::into_raw(Box::new(ffi)) as jlong) - }) -} - -/// Run `f`, catching panics and translating `Err` into a plain Java -/// `RuntimeException`. 
The connector-core helper does not know about -/// datafusion-java's exception hierarchy, so this stays minimal. -fn try_unwrap_or_throw(env: &mut JNIEnv, default: T, f: F) -> T -where - F: FnOnce(&mut JNIEnv) -> JniResult, -{ - match catch_unwind(AssertUnwindSafe(|| f(env))) { - Ok(Ok(value)) => value, - Ok(Err(err)) => { - let _ = env.throw_new("java/lang/RuntimeException", err.to_string()); - default - } - Err(panic) => { - let msg = panic - .downcast_ref::<&'static str>() - .map(|s| s.to_string()) - .or_else(|| panic.downcast_ref::().cloned()) - .unwrap_or_else(|| "rust panic in widening cdylib".to_string()); - let _ = env.throw_new("java/lang/RuntimeException", format!("panic: {msg}")); - default - } - } + datafusion_jni_common::runtime().handle() } diff --git a/spark/native/src/scan.rs b/spark/native/src/scan.rs new file mode 100644 index 0000000..aacfd45 --- /dev/null +++ b/spark/native/src/scan.rs @@ -0,0 +1,350 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Planning and execution of a Spark scan, entirely inside this cdylib. +//! +//! `createScan` takes ownership of a bridge's `FFI_TableProvider` pointer, +//! wraps the inner provider in a [`WideningTableProvider`] (in-process — no +//! 
re-FFI hop), registers it on a private `SessionContext` built from the +//! caller-pinned config, applies the pruned projection and the proto-encoded +//! pushed filters, and plans exactly once. The returned handle supports: +//! +//! - `partitionCount` — output partitions of the physical plan (shared-scan +//! mode probes this on the driver and indexes tasks by it); +//! - `executeStreamPartition` — an independent stream over ONE plan +//! partition, concurrently callable from multiple JVM threads +//! (`ExecutionPlan` and `TaskContext` are `Send + Sync`; each call only +//! clones their `Arc`s). Re-executing the same partition index (Spark task +//! retry / speculative execution) opens its own stream, but only succeeds +//! when every operator in that partition's pipeline supports repeated +//! `execute()` — stateless scans do, `RepartitionExec` pipelines do not; +//! - `executeStream` — the whole plan as one stream (legacy per-partition +//! payload mode, where the provider itself is the task's slice); +//! - `closeScan` — drop the plan. The single unsafe interleaving is closing +//! a handle that still has an in-flight call; the Java consumer (the +//! shared-scan cache) prevents it with a refcount covering every open +//! reader. +//! +//! Pinned-config determinism: the driver resolves `target_partitions` / +//! `batch_size` / option overrides once and ships them to every executor, so +//! a plan that yields N partitions on the driver yields N everywhere. This +//! module applies whatever it is handed and stays policy-free. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::ffi_stream::FFI_ArrowArrayStream; +use datafusion::arrow::ipc::writer::StreamWriter; +use datafusion::catalog::TableProvider; +use datafusion::dataframe::DataFrame; +use datafusion::execution::TaskContext; +use datafusion::physical_plan::{execute_stream, ExecutionPlan}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_ffi::table_provider::FFI_TableProvider; +use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; +use datafusion_jni_common::StreamingReader; +use datafusion_proto::logical_plan::from_proto::parse_expr; +use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; +use datafusion_proto::protobuf::LogicalExprNode; +use jni::objects::{JByteArray, JClass, JObjectArray, JString}; +use jni::sys::{jbyteArray, jint, jlong}; +use jni::JNIEnv; +use prost::Message; + +use crate::runtime; +use crate::widening::WideningTableProvider; + +/// Registration name of the (single) provider on the scan's private context. +/// Never surfaces in SQL — the plan is built through the DataFrame API — so +/// no quoting/collision concerns. +const SCAN_TABLE_NAME: &str = "df_spark_scan"; + +struct ScanState { + /// Kept alive for the plan's lifetime; the registered provider and the + /// runtime env both hang off it. + _ctx: SessionContext, + plan: Arc<dyn ExecutionPlan>, + task_ctx: Arc<TaskContext>, +} + +/// Take ownership of the bridge's `FFI_TableProvider` pointer and return the +/// widened in-process provider. +fn import_widened(ffi_raw_ptr: jlong) -> JniResult<Arc<dyn TableProvider>> { + if ffi_raw_ptr == 0 { + return Err("FFI_TableProvider pointer is null".into()); + } + let ffi_raw: Box<FFI_TableProvider> = + unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; + // `Arc::<dyn TableProvider>::from(&FFI_TableProvider)` returns a + // ForeignTableProvider that delegates through the producer's vtable; it + // owns its own retained copy, so our Box can drop immediately. 
+ let inner: Arc<dyn TableProvider> = (&*ffi_raw).into(); + drop(ffi_raw); + Ok(Arc::new(WideningTableProvider::new(inner))) +} + +fn collect_string_array(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult<Vec<String>> { + if arr.is_null() { + return Ok(Vec::new()); + } + let len = env.get_array_length(arr)?; + let mut owned: Vec<String> = Vec::with_capacity(len as usize); + for i in 0..len { + let elem = env.get_object_array_element(arr, i)?; + let jstr: JString = elem.into(); + owned.push(env.get_string(&jstr)?.into()); + } + Ok(owned) +} + +fn collect_byte_arrays(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult<Vec<Vec<u8>>> { + if arr.is_null() { + return Ok(Vec::new()); + } + let len = env.get_array_length(arr)?; + let mut owned: Vec<Vec<u8>> = Vec::with_capacity(len as usize); + for i in 0..len { + let elem = env.get_object_array_element(arr, i)?; + let bytes: JByteArray = elem.into(); + owned.push(env.convert_byte_array(&bytes)?); + } + Ok(owned) +} + +/// Driver-side schema probe: widened Arrow schema of the provider, as IPC +/// bytes (deserialized JVM-side with `MessageSerializer.deserializeSchema`). +/// Takes ownership of the pointer; the provider drops before returning. +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_providerSchemaIpc<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_raw_ptr: jlong, +) -> jbyteArray { + try_unwrap_or_throw( + &mut env, + std::ptr::null_mut(), + |env| -> JniResult<jbyteArray> { + let widened = import_widened(ffi_raw_ptr)?; + let schema = widened.schema(); + let mut buf: Vec<u8> = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buf, schema.as_ref())?; + writer.finish()?; + } + let arr = env.byte_array_from_slice(&buf)?; + Ok(arr.into_raw()) + }, + ) +} + +/// Build the scan: widen the provider, register it on a private context with +/// the pinned config, apply projection + pushed filters, plan once. 
+/// +/// `target_partitions` / `batch_size` <= 0 leave the DataFusion defaults; +/// `option_keys`/`option_values` are parallel arrays of config overrides; +/// empty `projection_columns` selects all columns; each element of +/// `filter_protos` is a serialized `datafusion.LogicalExprNode`. +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_raw_ptr: jlong, + target_partitions: jint, + batch_size: jint, + option_keys: JObjectArray<'local>, + option_values: JObjectArray<'local>, + projection_columns: JObjectArray<'local>, + filter_protos: JObjectArray<'local>, +) -> jlong { + try_unwrap_or_throw(&mut env, 0, |env| -> JniResult<jlong> { + let widened = import_widened(ffi_raw_ptr)?; + + let keys = collect_string_array(env, &option_keys)?; + let values = collect_string_array(env, &option_values)?; + if keys.len() != values.len() { + return Err(format!( + "option key/value arrays differ in length: {} vs {}", + keys.len(), + values.len() + ) + .into()); + } + let projection = collect_string_array(env, &projection_columns)?; + let filters = collect_byte_arrays(env, &filter_protos)?; + + let mut config = SessionConfig::new(); + if target_partitions > 0 { + config = config.with_target_partitions(target_partitions as usize); + } + if batch_size > 0 { + config = config.with_batch_size(batch_size as usize); + } + for (key, value) in keys.iter().zip(values.iter()) { + config.options_mut().set(key, value)?; + } + + let ctx = SessionContext::new_with_config(config); + ctx.register_table(SCAN_TABLE_NAME, widened)?; + + let mut df: DataFrame = runtime().block_on(ctx.table(SCAN_TABLE_NAME))?; + if !projection.is_empty() { + let refs: Vec<&str> = projection.iter().map(String::as_str).collect(); + df = df.select_columns(&refs)?; + } + for bytes in &filters { + let node = LogicalExprNode::decode(bytes.as_slice())?; + // TaskContext implements FunctionRegistry; the default codec is + // 
enough because the translator only emits column/literal/builtin + // expressions. + let registry = df.task_ctx(); + let expr = parse_expr(&node, &registry, &DefaultLogicalExtensionCodec {})?; + df = df.filter(expr)?; + } + + // task_ctx() borrows; capture before create_physical_plan consumes df. + let task_ctx = Arc::new(df.task_ctx()); + let plan = runtime().block_on(df.create_physical_plan())?; + + let state = ScanState { + _ctx: ctx, + plan, + task_ctx, + }; + Ok(Box::into_raw(Box::new(state)) as jlong) + }) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_partitionCount<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) -> jint { + try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult<jint> { + if handle == 0 { + return Err("scan handle is null".into()); + } + let state = unsafe { &*(handle as *const ScanState) }; + Ok(state + .plan + .properties() + .output_partitioning() + .partition_count() as jint) + }) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPartition<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + partition: jint, + ffi_stream_addr: jlong, +) { + try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + if handle == 0 { + return Err("scan handle is null".into()); + } + if ffi_stream_addr == 0 { + return Err("ffi stream address is null".into()); + } + let state = unsafe { &*(handle as *const ScanState) }; + + let partition_count = state + .plan + .properties() + .output_partitioning() + .partition_count(); + if partition < 0 || partition as usize >= partition_count { + return Err(format!( + "partition index {partition} out of range: plan has {partition_count} partition(s)" + ) + .into()); + } + + let plan = Arc::clone(&state.plan); + let task_ctx = Arc::clone(&state.task_ctx); + let schema: SchemaRef = plan.schema(); + + // ExecutionPlan::execute is synchronous, but operators may + // tokio::spawn at 
execute() time (RepartitionExec et al.), which + // requires a runtime context to be entered. + let stream = { + let _guard = runtime().enter(); + plan.execute(partition as usize, task_ctx)? + }; + + let reader = StreamingReader { schema, stream }; + let ffi = FFI_ArrowArrayStream::new(Box::new(reader)); + unsafe { + std::ptr::write(ffi_stream_addr as *mut FFI_ArrowArrayStream, ffi); + } + Ok(()) + }) +} + +/// Whole-plan stream for legacy per-partition payload mode (the provider +/// itself is the task's slice, so all plan partitions merge into one reader). +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + ffi_stream_addr: jlong, +) { + try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + if handle == 0 { + return Err("scan handle is null".into()); + } + if ffi_stream_addr == 0 { + return Err("ffi stream address is null".into()); + } + let state = unsafe { &*(handle as *const ScanState) }; + + let plan = Arc::clone(&state.plan); + let task_ctx = Arc::clone(&state.task_ctx); + let schema: SchemaRef = plan.schema(); + + // execute_stream coalesces multi-partition plans behind one stream. + let stream = { + let _guard = runtime().enter(); + execute_stream(plan, task_ctx)? 
+ }; + + let reader = StreamingReader { schema, stream }; + let ffi = FFI_ArrowArrayStream::new(Box::new(reader)); + unsafe { + std::ptr::write(ffi_stream_addr as *mut FFI_ArrowArrayStream, ffi); + } + Ok(()) + }) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_closeScan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) { + try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + if handle == 0 { + return Err("scan handle is null".into()); + } + drop(unsafe { Box::from_raw(handle as *mut ScanState) }); + Ok(()) + }) +} diff --git a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java index 8def9d4..2524c61 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java +++ b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java @@ -20,14 +20,17 @@ package io.datafusion.spark; /** - * JNI hooks into the connector-core widening cdylib ({@code - * libdatafusion_spark_helper.{so,dylib}}). + * JNI surface of the connector cdylib ({@code libdatafusion_spark_helper.{so,dylib}}). * - *

<p>The widening cdylib unwraps an FFI_TableProvider pointer produced by a bridge, wraps it in a - * {@code WideningTableProvider} that applies kernel-level {@code arrow::compute::cast} on incoming - * RecordBatches for any Spark-incompatible Arrow type (unsigned ints, Float16, Time, - * non-microsecond Timestamp, recursive List), and re-FFIs it for the consumer (datafusion-java's - * cdylib via {@code SessionContext.registerFfiTable}). + *

<p>The cdylib owns the whole DataFusion side of a scan: it takes an {@code FFI_TableProvider} + * pointer produced by a bridge, wraps the provider in a {@code WideningTableProvider} (kernel-level + * {@code arrow::compute::cast} for Spark-incompatible Arrow types), registers it on a private + * {@code SessionContext} built from the driver-pinned config, applies the pruned projection and the + * proto-encoded pushed filters, plans once, and streams plan partitions back over {@code + * FFI_ArrowArrayStream}. + * + *

<p>Errors throw the typed {@code org.apache.datafusion.*} exception hierarchy (from the + * datafusion-java core jar, a compile dependency of this module). * *

    The native library is loaded once per JVM via {@link NativeLibraryLoader}. The library payload * lives inside this jar under {@code io/datafusion/spark///} and is extracted to a temp @@ -42,11 +45,52 @@ private FfiHelperNative() {} } /** - * Take ownership of an {@code FFI_TableProvider} pointer produced by a bridge cdylib, wrap it in - * a {@code WideningTableProvider}, and re-wrap the result as a fresh {@code FFI_TableProvider}. - * Returns the new raw pointer; the caller owns it. + * Driver-side schema probe: the widened Arrow schema of the provider, serialized as Arrow IPC + * bytes (deserialize with {@code MessageSerializer.deserializeSchema}). * - *

<p>The input pointer must not be reused after this call returns: ownership transfers. + *

<p>Takes ownership of {@code ffiProviderRawPtr}; the provider is dropped before returning and + * the pointer must not be reused. + */ + public static native byte[] providerSchemaIpc(long ffiProviderRawPtr); + + /** + * Build a planned scan over the provider and return its handle. + * + *

<p>Takes ownership of {@code ffiProviderRawPtr}. {@code targetPartitions} / {@code batchSize} + * {@code <= 0} leave the DataFusion defaults; {@code optionKeys}/{@code optionValues} are + * parallel arrays of DataFusion config overrides; an empty {@code projectionColumns} selects all + * columns; each element of {@code filterProtos} is a serialized {@code datafusion.LogicalExprNode} + * applied as a filter. + * + *

    The caller owns the returned handle and must pair it with {@link #closeScan(long)}. Closing + * while a stream opened from this handle is still in flight is undefined behaviour — the + * shared-scan cache's refcount enforces this; any other caller must serialize close itself. + */ + public static native long createScan( + long ffiProviderRawPtr, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos); + + /** Output partition count of the planned physical plan. */ + public static native int partitionCount(long scanHandle); + + /** + * Open an independent stream over ONE plan partition, writing an {@code FFI_ArrowArrayStream} + * into the caller-allocated struct at {@code ffiStreamAddr}. Concurrent-safe across JVM threads. */ - public static native long wrapWithWidening(long ffiProviderRawPtr); + public static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + + /** + * Stream the WHOLE plan (all partitions coalesced) into the caller-allocated {@code + * FFI_ArrowArrayStream} at {@code ffiStreamAddr}. Used by legacy per-partition payload mode, + * where the provider itself already represents the task's slice. + */ + public static native void executeStream(long scanHandle, long ffiStreamAddr); + + /** Drop the planned scan. See {@link #createScan} for the close-vs-in-flight-stream contract. */ + public static native void closeScan(long scanHandle); } diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 506cd66..2003016 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -73,7 +73,7 @@ public interface FfiProviderFactory { /** * Filter-aware variant of {@link #listPartitions(byte[])}. 
The connector calls this overload with * the pushed-down predicates ({@code LogicalExprNode} proto bytes, one array per predicate, same - * encoding the executor later replays via {@code DataFrame.filterFromProto}). Bridges that can + * encoding the executor later replays via {@code FfiHelperNative.createScan}). Bridges that can * map predicates onto their partition layout (e.g. {@code segment_id = 'x'}) should prune * partitions that cannot match — pruning here eliminates whole Spark tasks, whereas the per-task * filter only reduces rows inside a task. diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index 4a357d6..c59778d 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -21,7 +21,6 @@ package io.datafusion.spark import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.ipc.ArrowReader -import org.apache.datafusion.{DataFrame, SessionContext} import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -33,14 +32,10 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * 2. `createProvider(optionsProtoBytes, partitionBytes)` — bridge builds an `Arc` materialising the slice described by `partitionBytes`, wraps it in an * `FFI_TableProvider`, returns the raw pointer. - * 3. Hand that pointer to connector-core's widening cdylib via `FfiHelperNative.wrapWithWidening`. - * The cdylib wraps the inner provider in a `WideningTableProvider` (kernel-level - * `arrow::compute::cast` for Spark-incompatible Arrow types) and re-FFIs it. - * 4. Register the widened pointer on a fresh `SessionContext` via `registerFfiTable`. - * 5. Build a `SELECT projection FROM

    ` DataFrame; apply pushed filters via - * `DataFrame.filterFromProto`, closing each intermediate frame. - * 6. `executeStream` returns an `ArrowReader`; batches surface through - * [[ArrowColumnarBatchIteration]]. + * 3. `FfiHelperNative.createScan` does the rest natively: widening wrap, private + * `SessionContext`, projection, pushed proto filters, physical plan. + * 4. `executeStream` streams the whole plan (the provider already IS the task's slice); + * batches surface through [[ArrowColumnarBatchIteration]]. */ class DatafusionColumnarPartitionReader( partition: DatafusionInputPartition, @@ -49,26 +44,41 @@ class DatafusionColumnarPartitionReader( with ArrowColumnarBatchIteration { private val allocator = new RootAllocator(Long.MaxValue) - private val ctx: SessionContext = new SessionContext() private val factory: FfiProviderFactory = instantiateFactory(partition.factoryFqcn) - override protected val arrowReader: ArrowReader = { - val rawPtr = factory.createProvider(partition.optionsProtoBytes, partition.partitionBytes) - val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) - ctx.registerFfiTable(DatafusionSqlBuilder.PartitionTableName, widenedPtr) - var df: DataFrame = ctx.sql( - DatafusionSqlBuilder - .buildSql(partition.projectionColumnNames, DatafusionSqlBuilder.PartitionTableName)) - var i = 0 - while (i < partition.filterProtoBytes.length) { - val filtered = df.filterFromProto(partition.filterProtoBytes(i)) - df.close() - df = filtered - i += 1 + private val scanHandle: Long = + try { + val rawPtr = factory.createProvider(partition.optionsProtoBytes, partition.partitionBytes) + FfiHelperNative.createScan( + rawPtr, + /* targetPartitions = */ -1, + /* batchSize = */ -1, + Array.empty[String], + Array.empty[String], + partition.projectionColumnNames, + partition.filterProtoBytes + ) + } catch { + case t: Throwable => + try allocator.close() + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + throw t + } + + override protected 
val arrowReader: ArrowReader = + try { + FfiStream.importReader(allocator) { addr => + FfiHelperNative.executeStream(scanHandle, addr) + } + } catch { + case t: Throwable => + try FfiHelperNative.closeScan(scanHandle) + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + try allocator.close() + catch { case suppressed: Throwable => t.addSuppressed(suppressed) } + throw t } - df.executeStream(allocator) - } override def close(): Unit = { var first: Throwable = null @@ -76,7 +86,7 @@ class DatafusionColumnarPartitionReader( try f catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } safe(arrowReader.close()) - safe(ctx.close()) + safe(FfiHelperNative.closeScan(scanHandle)) safe(allocator.close()) if (first != null) throw first } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index 03d2c2e..9fbe070 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -38,7 +38,7 @@ sealed trait DatafusionPartition extends InputPartition * Opaque to connector-core. Same bytes ride along on every partition. * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is - * applied via `DataFrame.filterFromProto`. + * applied natively via `FfiHelperNative.createScan`. * - `partitionId`: stable identifier (e.g. Rerun segment id) — surfaces in Spark UI/logs/errors. * - `partitionBytes`: opaque per-partition payload from `PartitionInfo.partitionBytes`. Passed * back into `createProvider` so the bridge materialises *this* slice. 
diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index d3931ce..80f35bd 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -59,7 +59,7 @@ final case class SharedScanMode( /** * Read plan for a DataFusion-backed scan. Holds pruning state, the pushed predicates (for * `description()` / `explain(True)`), the corresponding `LogicalExprNode` proto byte arrays the - * executor applies via `DataFrame.filterFromProto`, and the driver-resolved + * executor applies natively via `FfiHelperNative.createScan`, and the driver-resolved * [[DatafusionScanMode]]. * * Legacy mode with a bridge-declared [[ReportedPartitioning]] surfaces `KeyGroupedPartitioning` diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index bcf0f99..fd1d66c 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -19,9 +19,12 @@ package io.datafusion.spark +import java.io.ByteArrayInputStream +import java.nio.channels.Channels import java.util -import org.apache.datafusion.SessionContext +import org.apache.arrow.vector.ipc.ReadChannel +import org.apache.arrow.vector.ipc.message.MessageSerializer import org.apache.spark.sql.connector.catalog.{Table, TableProvider} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.sources.DataSourceRegister @@ -33,11 +36,11 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap * - Subclass and override [[shortName]] + [[factoryFqcn]] (the rerun-connector pattern), or * - Use this class directly with `option("df.factory", "fully.qualified.FactoryClass")`. 
* - * Schema discovery happens driver-side via a transient SessionContext: the factory's - * `FFI_TableProvider` is built, wrapped with the widening cdylib, registered on the context, and - * its Arrow schema read via `tableSchema(name)`. The same `optionsProtoBytes` (and the factory - * FQCN) is then carried verbatim through `DatafusionInputPartition`, so each executor task - * repeats the same factory → wrapWithWidening → registerFfiTable pipeline locally. + * Schema discovery happens driver-side inside the connector cdylib: the factory's + * `FFI_TableProvider` is built and handed to `FfiHelperNative.providerSchemaIpc`, which widens it + * and returns its Arrow schema as IPC bytes. The same `optionsProtoBytes` (and the factory FQCN) + * is then carried verbatim through `DatafusionInputPartition`, so each executor task repeats the + * same factory → createScan pipeline locally. */ class DatafusionSource extends TableProvider with DataSourceRegister { @@ -64,17 +67,12 @@ class DatafusionSource extends TableProvider with DataSourceRegister { val fqcn = factoryFqcn(options) val factory = instantiateFactory(fqcn) val optionsBytes = factory.encodeOptions(options.asCaseSensitiveMap()) - val arrowSchema = { - val ctx = new SessionContext() - try { - // Schema probe: pass empty partitionBytes — bridges are required to honour an empty - // payload for the driver-side probe (schema must not depend on per-partition state). - val rawPtr = factory.createProvider(optionsBytes, Array.emptyByteArray) - val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) - ctx.registerFfiTable("__df_schema_probe__", widenedPtr) - ctx.tableSchema("__df_schema_probe__") - } finally ctx.close() - } + // Schema probe: pass empty partitionBytes — bridges are required to honour an empty + // payload for the driver-side probe (schema must not depend on per-partition state). 
+ val rawPtr = factory.createProvider(optionsBytes, Array.emptyByteArray) + val ipcBytes = FfiHelperNative.providerSchemaIpc(rawPtr) + val arrowSchema = MessageSerializer.deserializeSchema( + new ReadChannel(Channels.newChannel(new ByteArrayInputStream(ipcBytes)))) ArrowToSparkSchema.toSparkSchema(arrowSchema) } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala b/spark/src/main/scala/io/datafusion/spark/FfiStream.scala similarity index 52% rename from spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala rename to spark/src/main/scala/io/datafusion/spark/FfiStream.scala index a209bed..eb1149a 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSqlBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/FfiStream.scala @@ -19,23 +19,26 @@ package io.datafusion.spark -/** Shared SQL construction for the per-task (legacy) and shared-scan read paths. */ -private[spark] object DatafusionSqlBuilder { +import org.apache.arrow.c.{ArrowArrayStream, Data} +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.ipc.ArrowReader - /** Registration name for the per-task provider in legacy mode. */ - val PartitionTableName = "df_spark_partition" - - /** Registration name for the per-executor provider in shared-scan mode. */ - val SharedTableName = "df_spark_shared" +/** + * Arrow C-data import of a native-produced `FFI_ArrowArrayStream`: allocate the empty struct, + * let the native side write into it, then hand it to Arrow Java. On any failure the struct is + * released so a half-written stream can't leak. + */ +private[spark] object FfiStream { - /** `SELECT FROM "
    "`. */ - def buildSql(projectionColumnNames: Array[String], tableName: String): String = { - val cols = - if (projectionColumnNames.isEmpty) "*" - else - projectionColumnNames - .map(c => "\"" + c.replace("\"", "\"\"") + "\"") - .mkString(", ") - s"""SELECT $cols FROM "$tableName"""" + def importReader(allocator: BufferAllocator)(writeStream: Long => Unit): ArrowReader = { + val stream = ArrowArrayStream.allocateNew(allocator) + try { + writeStream(stream.memoryAddress()) + Data.importArrayStream(allocator, stream) + } catch { + case t: Throwable => + stream.close() + throw t + } } } diff --git a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala index 2351484..3009737 100644 --- a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala +++ b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala @@ -21,25 +21,23 @@ package io.datafusion.spark import org.apache.arrow.memory.{BufferAllocator, RootAllocator} import org.apache.arrow.vector.ipc.ArrowReader -import org.apache.datafusion.{DataFrame, PartitionedExecution, SessionContext} import org.apache.spark.internal.Logging /** - * JNI-backed shared-scan entry: one provider, one `SessionContext`, one planned - * [[PartitionedExecution]]. + * JNI-backed shared-scan entry: one provider, one planned scan handle inside the connector + * cdylib. * * The build sequence is the single code path for BOTH the driver-side partition-count probe and - * every executor's cache entry — identical widening, registration, SQL, filters, and pinned - * session config are what make the partition count comparable across machines (the bridge's - * determinism contract covers the rest). 
+ * every executor's cache entry — identical widening, registration, projection, filters, and + * pinned session config are what make the partition count comparable across machines (the + * bridge's determinism contract covers the rest). */ private[spark] final class NativeSharedScanResources( allocator: RootAllocator, - ctx: SessionContext, - execution: PartitionedExecution + scanHandle: Long ) extends SharedScanResources { - override def partitionCount: Int = execution.partitionCount() + override def partitionCount: Int = FfiHelperNative.partitionCount(scanHandle) override def newTaskAllocator(name: String): BufferAllocator = allocator.newChildAllocator(name, 0, Long.MaxValue) @@ -47,15 +45,16 @@ private[spark] final class NativeSharedScanResources( override def openPartitionStream( partition: Int, taskAllocator: BufferAllocator): ArrowReader = - execution.executeStream(partition, taskAllocator) + FfiStream.importReader(taskAllocator) { addr => + FfiHelperNative.executeStreamPartition(scanHandle, partition, addr) + } override def close(): Unit = { var first: Throwable = null def safe(f: => Unit): Unit = try f catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } - safe(execution.close()) - safe(ctx.close()) + safe(FfiHelperNative.closeScan(scanHandle)) safe(allocator.close()) if (first != null) throw first } @@ -75,35 +74,22 @@ private[spark] object NativeSharedScanResources extends Logging { .asInstanceOf[FfiProviderFactory] val allocator = new RootAllocator(Long.MaxValue) - var ctx: SessionContext = null try { // Shared mode builds the dataset-wide provider: empty partitionBytes, like the // driver-side schema probe. DataFusion-native partitioning replaces listPartitions. 
val rawPtr = factory.createProvider(spec.optionsProtoBytes, Array.emptyByteArray) - val widenedPtr = FfiHelperNative.wrapWithWidening(rawPtr) - - ctx = spec.pinnedConfig.buildContext() - ctx.registerFfiTable(DatafusionSqlBuilder.SharedTableName, widenedPtr) - - var df: DataFrame = ctx.sql( - DatafusionSqlBuilder - .buildSql(spec.projectionColumnNames, DatafusionSqlBuilder.SharedTableName)) - var i = 0 - while (i < spec.filterProtoBytes.length) { - val filtered = df.filterFromProto(spec.filterProtoBytes(i)) - df.close() - df = filtered - i += 1 - } - - val execution = df.toPartitionedExecution() - new NativeSharedScanResources(allocator, ctx, execution) + val scanHandle = FfiHelperNative.createScan( + rawPtr, + spec.pinnedConfig.targetPartitions, + spec.pinnedConfig.batchSize, + spec.pinnedConfig.options.map(_._1).toArray, + spec.pinnedConfig.options.map(_._2).toArray, + spec.projectionColumnNames, + spec.filterProtoBytes + ) + new NativeSharedScanResources(allocator, scanHandle) } catch { case t: Throwable => - if (ctx != null) { - try ctx.close() - catch { case suppressed: Throwable => t.addSuppressed(suppressed) } - } try allocator.close() catch { case suppressed: Throwable => t.addSuppressed(suppressed) } throw t diff --git a/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala index af71b4c..7fc21ea 100644 --- a/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala +++ b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala @@ -19,7 +19,6 @@ package io.datafusion.spark -import org.apache.datafusion.SessionContext import org.apache.spark.sql.internal.SQLConf /** @@ -30,8 +29,8 @@ import org.apache.spark.sql.internal.SQLConf * so a plan that yields N partitions on the driver could yield M ≠ N on a differently-sized * executor — and partition-indexed execution would silently drop or duplicate data. 
The driver * resolves these values once in `DatafusionScanBuilder.build()`, ships them inside every - * [[DatafusionSharedScanPartition]], and both the driver probe and the executors construct their - * `SessionContext` exclusively through [[buildContext]]. + * [[DatafusionSharedScanPartition]], and both the driver probe and the executors hand the same + * values to `FfiHelperNative.createScan`, which builds the native `SessionContext` from them. * * `options` additionally disables the optimizer's plan-reshaping repartition passes so the * physical partitioning is exactly what the provider's `scan()` reports, on every machine. @@ -40,17 +39,7 @@ final case class PinnedSessionConfig( targetPartitions: Int, batchSize: Int, options: Vector[(String, String)] -) extends Serializable { - - def buildContext(): SessionContext = { - val builder = SessionContext - .builder() - .targetPartitions(targetPartitions) - .batchSize(batchSize) - options.foreach { case (k, v) => builder.setOption(k, v) } - builder.build() - } -} +) extends Serializable object PinnedSessionConfig { From 1f73a6fb48e0aeea33b4f0b1a8ab35b66aeac0b5 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 15:40:31 +0200 Subject: [PATCH 12/22] docs: rewrite examples README and move Spark guide into spark/ examples/README.md targets first-time users: prerequisites, two-step build with the why behind install-vs-package and exec:exec, entry points and expected output per example, troubleshooting. SPARK_INTEGRATION.md becomes spark/README.md, rewritten as a connector-builder guide: the three implementation points with file paths, the Spark-task-vs-DataFusion-partition mapping in both scan modes, and task sizing guidance (bin-pack small partitions via opaque partitionBytes, keep per-stage task counts in the low thousands). Also drop product-specific (Rerun) references from doc comments; bridge examples now name neutral domains. 
Co-Authored-By: Claude Fable 5 --- examples/README.md | 117 +++++-- examples/SPARK_INTEGRATION.md | 274 --------------- examples/native/src/lib.rs | 2 +- .../examples/ExampleFfiProviderFactory.java | 2 +- spark/README.md | 314 ++++++++++++++++++ .../datafusion/spark/FfiProviderFactory.java | 2 +- .../io/datafusion/spark/PartitionInfo.java | 2 +- .../spark/DatafusionInputPartition.scala | 2 +- .../datafusion/spark/DatafusionSource.scala | 7 +- 9 files changed, 403 insertions(+), 319 deletions(-) delete mode 100644 examples/SPARK_INTEGRATION.md create mode 100644 spark/README.md diff --git a/examples/README.md b/examples/README.md index 127c209..6f76c4e 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,57 +1,102 @@ # DataFusion-Java examples -Self-contained Java programs against the DataFusion-Java API. +Small, self-contained programs that each demonstrate one feature of the +DataFusion-Java API. Every example is a Java class with a `main` method that +builds a query against an in-process DataFusion engine and prints its result +(as tab-separated rows) to stdout. They are the fastest way to see what the +library can do and to copy a working starting point. -`exec:exec` (not `exec:java`) runs each one — the pom shells out to a fresh -`java` process so the JNI library's `--add-opens=java.base/java.nio=ALL-UNNAMED` -JVM flag actually applies. +## Prerequisites -`exec:exec` is a separate Maven invocation from the one that built the -project, so it resolves `datafusion-java` from your local Maven repository -rather than the reactor's `target/` dirs. That means the parent must be -**installed** to the local repo first — `package -am` builds the jar but -does NOT publish it, which surfaces as -`Could not find artifact org.apache.datafusion:datafusion-java:jar:0.2.0-SNAPSHOT`. 
+- JDK 17+ +- Maven (the repo ships `./mvnw`, no install needed) +- Rust toolchain (`cargo`) — the library calls into a native DataFusion + build, so the Rust side must be compiled once first + +## Build once + +From the repo root: + +```bash +# 1. Compile the native libraries (DataFusion + JNI glue). +cargo build --release + +# 2. Build the Java/Scala modules and install them into your local Maven repo. +./mvnw -B install -DskipTests -Drat.skip=true -Ddatafusion.native.profile=release +``` + +Step 2 must be `install`, not `package`: running an example below starts a +fresh Maven invocation that resolves `datafusion-java` from your local Maven +repository (`~/.m2/repository`), and only `install` publishes the jar there. +If you skip it you'll see +`Could not find artifact org.apache.datafusion:datafusion-java:...` — +that error means "run step 2". + +(If your local Maven repo lives somewhere non-standard, add +`-Dmaven.repo.local=/path/to/repo` to step 2 **and** to every run command.) + +## Run your first example ```bash -# Install the fork into your local Maven repo, then run any example. -mvn -B install -DskipTests -Drat.skip=true \ - -Ddatafusion.native.profile=release -mvn -B -pl examples exec:exec \ - -Dexec.mainClass=org.apache.datafusion.examples. +./mvnw -B -pl examples exec:exec \ + -Dexec.mainClass=org.apache.datafusion.examples.SqlQueryExample ``` -(If your local Maven repo lives somewhere other than `~/.m2/repository`, -add `-Dmaven.repo.local=/path/to/repo` to BOTH invocations.) +This registers a small CSV file, runs a SQL aggregation over it, and prints +the result rows. Swap `SqlQueryExample` for any class in the table below. + +> Why `exec:exec` and not `exec:java`? Each example runs in a fresh `java` +> process so the JVM flag the native Arrow integration needs +> (`--add-opens=java.base/java.nio=ALL-UNNAMED`) actually applies. `exec:java` +> would run inside Maven's own JVM without it. 
+ +## The examples -| Class | What it shows | -| -------------------------------- | --------------------------------------------------------------------------------------------- | -| `SqlQueryExample` | Register a CSV file and run a SQL aggregation. | -| `DataFrameExample` | DataFrame API: filter, group, sort. | -| `ProtoPlanExample` | Build a `LogicalPlanNode` proto in Java, hand it to `SessionContext.fromProto`. | -| `JdbcExample` | Pull from an H2 JDBC source into Arrow, register it, query. | -| `AddOneExample` | Implement a Scalar UDF in Java and register it on the session. | -| `NestedTypeUdfExample` | Scalar UDF over `List` — input + output nested arrow types. | -| `ExampleFfiProviderFactory` | Build an `FFI_TableProvider` in Rust (a `MemTable`) and expose it to Spark through the connector's `FfiProviderFactory` interface. **See: [SPARK_INTEGRATION.md](SPARK_INTEGRATION.md) and the pyspark demo under [`python/`](python/).** | +| Entry point (`-Dexec.mainClass=org.apache.datafusion.examples.<…>`) | Demonstrates | What you'll see | +| --- | --- | --- | +| `SqlQueryExample` | Register a CSV file, run a SQL aggregation | The aggregated rows printed as TSV | +| `DataFrameExample` | The DataFrame API: filter, group, sort — no SQL strings | The transformed rows | +| `ProtoPlanExample` | Build a DataFusion `LogicalPlanNode` protobuf in Java and execute it via `SessionContext.fromProto` — the wire-format path used by query frontends | The plan's result rows | +| `JdbcExample` | Pull rows from a JDBC source (in-memory H2) into Arrow, register them as a table, query them | Rows that originated in H2, queried through DataFusion | +| `AddOneExample` | Write a scalar UDF in Java and call it from SQL | Each input value, plus one | +| `NestedTypeUdfExample` | A scalar UDF whose input and output are nested Arrow types (`List`) | The transformed list column | -## Building the FFI example's cdylib +## The Spark connector example -The FFI provider examples rely on a small Rust 
cdylib under -[`native/`](native/). It is a member of the repo-root Cargo workspace, so -build it by name from anywhere in the tree: +One example is not a standalone `main`: +`ExampleFfiProviderFactory` implements the Spark connector's +`FfiProviderFactory` interface over a tiny Rust-built in-memory table (the +cdylib under [`native/`](native/)). It exists to be loaded *by Spark* — the +runnable end-to-end version is the PySpark demo under +[`python/`](python/), and the guide to building your own connector is +[`../spark/README.md`](../spark/README.md). + +To build its cdylib (workspace member, buildable from anywhere in the tree): ```bash cargo build -p datafusion-java-ffi-example --release ``` -The example's `System.load` searches the following paths in order: +The factory's `System.load` searches, in order: 1. `-Dexample.ffi.lib.path=/abs/path/to/lib...` (explicit override) -2. `rust-target/release/` (Maven's cwd is the repo root) +2. `rust-target/release/` (cwd = repo root) 3. `rust-target/debug/` -4. `../rust-target/release/` (cwd inside the `examples` module) +4. `../rust-target/release/` (cwd = the `examples` module) 5. `../rust-target/debug/` -Where `` is `libdatafusion_java_ffi_example.so` on Linux, -`libdatafusion_java_ffi_example.dylib` on macOS, or -`datafusion_java_ffi_example.dll` on Windows. +where `` is `libdatafusion_java_ffi_example.so` (Linux), +`libdatafusion_java_ffi_example.dylib` (macOS), or +`datafusion_java_ffi_example.dll` (Windows). + +## Troubleshooting + +- **`Could not find artifact org.apache.datafusion:datafusion-java`** — the + parent wasn't installed to your local Maven repo. Re-run build step 2 + (`install`, not `package`). +- **`Native library not found ...`** — the Rust side wasn't built, or was + built in a different profile than Maven expects. Re-run build step 1 and + keep `-Ddatafusion.native.profile=release` consistent between the cargo + profile (`--release`) and the Maven flag. +- **`UnsatisfiedLinkError ... 
datafusion_java_ffi_example`** — only the FFI + example's cdylib is missing; see "The Spark connector example" above. diff --git a/examples/SPARK_INTEGRATION.md b/examples/SPARK_INTEGRATION.md deleted file mode 100644 index a658e4d..0000000 --- a/examples/SPARK_INTEGRATION.md +++ /dev/null @@ -1,274 +0,0 @@ -# Using an FFI TableProvider as a Spark Data Source - -The FFI handover is simple: Rust builds an `FFI_TableProvider`, hands the raw -pointer to the JVM, and the JVM passes it to the connector cdylib -(`FfiHelperNative.createScan`), which does everything DataFusion-side in -process — widening, session construction, projection, pushed filters, -planning, and partition streams. - -That flow plugs into Apache Spark as a DataSource V2 by way of the -[`connector-core`](https://github.com/rerun-io/rerun-spark-connector) module -(generic Spark plumbing donated upstream-ready). Below is the recipe for -wiring a domain bridge — e.g. an in-house format or a custom catalog — into -Spark via this pattern. 
- -## Architecture - -``` -+--------------------------+ +------------------------------+ -| Your bridge cdylib | byte[] opts | Your bridge JVM glue | -| - Rust JNI: | <----+ | - Java POJO + proto encoder | -| createFfiProvider | | | - FfiProviderFactory impl | -| listPartitions | | jlong | - System.load(cdylib) | -| - FFI_TableProvider | <----+----+----+-------- driver / executor | -+--------------------------+ raw ptr +------------------------------+ - | - v -+--------------------------+ +------------------------------+ -| connector-core cdylib | jlong ptr | connector-core JVM | -| - WideningTableProvider | <------------- | - DatafusionSource (DSv2) | -| over arrow::cast | | - SparkPredicateTranslator | -| - createScan: session, | FFI_Arrow- | - ColumnarPartitionReader | -| projection, filters, | ArrayStream | - SharedScanCache | -| plan, exec partitions | -------------> | | -+--------------------------+ +------------------------------+ -``` - -Key invariants: - -- Only the opaque `FFI_TableProvider` pointer crosses the cdylib boundary - (and `FFI_ArrowArrayStream` on the way back). No `SessionContext` is ever - shared, and none exists JVM-side — planning and execution live entirely in - the connector cdylib. -- The connector cdylib widens between your bridge's provider and the scan: - it casts Spark-incompatible Arrow types (UInt*, Float16, - Time*, non-µs Timestamp, recursive List/LargeList/FixedSizeList) using - kernel-level `arrow::compute::cast`. No SQL, no view rewrites. -- Predicate pushdown crosses the FFI boundary as a `LogicalExprNode` proto - (datafusion-proto). Spark translates V2 `Predicate`s and ships the bytes; - the producer's `TableProviderFilterPushDown::scan(...)` sees them as Rust - `Expr`s. - -## Producer side (Rust) - -Your bridge cdylib exposes a `createFfiProvider` JNI entrypoint that decodes -your domain proto, builds an `Arc`, and wraps it in -`FFI_TableProvider`. 
This is exactly what -[`examples/native/src/lib.rs`](native/src/lib.rs) does for a `MemTable`. For -a real bridge, replace the `MemTable` with your own `TableProvider` -implementation: - -```rust -let provider: Arc = runtime().block_on(build_provider(opts))?; -let ffi = FFI_TableProvider::new( - provider, - /*can_support_pushdown_filters=*/ true, - Some(runtime().clone()), - FFI_TaskContextProvider::from(&ctx_provider), // throwaway local SessionContext - /*logical_codec=*/ None, // default DataFusion codec -); -Box::into_raw(Box::new(ffi)) as jlong -``` - -Driver-side partition enumeration goes through a second JNI entrypoint -`listPartitions(options_proto_bytes) -> PartitionInfo[]`. One Spark task gets -created per returned entry. Each `PartitionInfo` carries: - -- `id` — stable, human-readable partition identifier (surfaces in Spark UI/logs). -- `partitionBytes` — opaque per-partition payload, replayed into - `createProvider(opts, partitionBytes)` so the executor materialises *this* - slice. Empty array = no per-partition state. -- `preferredLocations` — hostnames where this slice's data lives. Spark uses - these (subject to `spark.locality.wait`) to co-locate the task with the - data — e.g. four partitions per worker on a 3-worker cluster. - -## JVM glue - -Implement `io.datafusion.spark.FfiProviderFactory` (from -[`connector-core`](https://github.com/rerun-io/rerun-spark-connector/blob/main/connector-core/src/main/java/io/datafusion/spark/FfiProviderFactory.java)). -Must be no-arg constructable so executors can instantiate it via -`Class.forName(...).getDeclaredConstructor().newInstance()`. - -```java -public final class MyBridgeProviderFactory implements FfiProviderFactory { - - @Override - public byte[] encodeOptions(Map sparkOptions) { - // Translate Spark options ("url", "table", ...) into your proto. 
- return MyBridgeOptions.fromMap(sparkOptions).toProtoBytes(); - } - - @Override - public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { - // Bridge enumerates slices and resolves their host placement: - // record MySlice(String id, byte[] payload, String[] hosts) {} - MySlice[] slices = MyBridgeNative.listSlices(optionsProtoBytes); - PartitionInfo[] out = new PartitionInfo[slices.length]; - for (int i = 0; i < slices.length; i++) { - out[i] = new PartitionInfo(slices[i].id(), slices[i].payload(), slices[i].hosts()); - } - return out; - } - - @Override - public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { - // partitionBytes is the same payload returned from listPartitions for *this* task. - // The driver-side schema probe passes an empty array; honour it. - return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); - } - - @Override - public ReportedPartitioning reportPartitioning(byte[] optionsProtoBytes) { - // Optional. Return non-null only when each PartitionInfo's rows all share the same - // key tuple under the declared transforms — Spark elides shuffles ahead of joins/aggs - // grouped on those keys. Return null when the layout is unknown or row-key mapping - // would be lossy. 
- return ReportedPartitioning.identity("device_id"); - // or: ReportedPartitioning.bucket(numBuckets, "user_id"); - } -} -``` - -## Wiring it into Spark - -Two paths, pick one: - -### Option A — config option per use - -```python -df = (spark.read.format("datafusion") - .option("df.factory", "com.example.MyBridgeProviderFactory") - .option("url", "rerun+http://localhost:51234") - .option("table", "my_dataset") - .load()) -df.printSchema() -df.filter("ts > 1700000000").show() -``` - -### Option B — thin shim with a short name - -Mirror the -[`rerun-connector`](https://github.com/rerun-io/rerun-spark-connector/blob/main/rerun-connector/src/main/scala/io/rerun/spark/RerunDataSource.scala) -shim — a ~20-line subclass that bakes the factory FQCN in: - -```scala -class MyDataSource extends DatafusionSource { - override def shortName(): String = "my_format" - override protected def factoryFqcn(opts: CaseInsensitiveStringMap): String = - "com.example.MyBridgeProviderFactory" -} -``` - -Register via `META-INF/services/org.apache.spark.sql.sources.DataSourceRegister`, -then: - -```python -df = (spark.read.format("my_format") - .option("url", "...") - .option("table", "...") - .load()) -``` - -## What runs where - -| Phase | Where | Path | -| --------------------------- | --------- | ---- | -| `inferSchema` | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → `FfiHelperNative.providerSchemaIpc` (widens, returns Arrow IPC schema) | -| `ScanBuilder.build` | Driver | `factory.listPartitions(optionsBytes, filterBytes)` (filter-aware overload — bridges can prune partitions; cached on Scan) + `factory.reportPartitioning(optionsBytes)` (cached on Scan) | -| `outputPartitioning` | Driver | `KeyGroupedPartitioning(reported.keys, partitions.length)` when bridge declared one; `UnknownPartitioning(partitions.length)` otherwise. Spark may elide shuffles when keys line up with downstream join/agg grouping. 
| -| `planInputPartitions` | Driver | Reuses the cached `PartitionInfo[]`; one task per entry with that entry's `partitionBytes` + `preferredLocations` | -| Predicate translation | Driver | `SparkPredicateTranslator.translate(Predicate)` → `LogicalExprNode` proto bytes (each pushed predicate is independent) | -| Per-task scan | Executor | Same factory → `createProvider(opts, partitionBytes)` → `FfiHelperNative.createScan` (widen, projection, pushed proto filters, plan) → `executeStream` | - -## Partition key values (`HasPartitionKey`) - -Declaring `reportPartitioning` alone is NOT enough on Spark 3.3+: Spark's -`DataSourceV2ScanExecBase.groupPartitions` only consumes the declared -`KeyGroupedPartitioning` when every input partition also implements -`HasPartitionKey`. To activate it, return the key values per partition via -`PartitionInfo`'s 4-argument constructor: - -```java -new PartitionInfo(slice.id(), slice.payload(), slice.hosts(), - new Object[] {slice.segmentId()}); // matches identity("segment_id") -``` - -Rules: all partitions carry keys or none (mixed state fails the scan -driver-side); array arity must equal the declared key count; values must be -`CatalystTypeConverters`-convertible Java types (`String`, `Long`, -`java.time.Instant`, `java.time.LocalDate`, `java.math.BigDecimal`, ...). -Storage-partitioned joins additionally require -`spark.sql.sources.v2.bucketing.enabled=true`. - -## Shared-scan mode - -The default model above builds one provider per Spark task. For datasets with -thousands of small partitions — or providers whose construction is expensive -(remote metadata, connection setup) — the per-task fixed cost dominates. -Shared-scan mode flips the mapping: the bridge's provider is built ONCE per -(executor JVM × query) with empty `partitionBytes`, planned once, and Spark -runs one task per *DataFusion-native* output partition; task `i` streams plan -partition `i` from the cached plan. 
- -Opt in per dataset from the factory: - -```java -@Override -public boolean sharedScan(byte[] optionsProtoBytes) { - return MyBridgeOptions.fromProtoBytes(optionsProtoBytes).useSharedScan(); -} -``` - -What changes: - -| Phase | Where | Path | -| ---------------------- | -------- | ---- | -| `ScanBuilder.build` | Driver | mint `scanId` (UUID) + pin session config → probe build (same code path as executors) → physical plan partition count `N` → `N` tasks | -| `outputPartitioning` | Driver | always `UnknownPartitioning(N)` — DataFusion partitions carry no key contract; `listPartitions` / `reportPartitioning` are not called | -| Per-task scan | Executor | `SharedScanCache.acquire(scanId)` → (first task only) `createProvider(opts, EMPTY)` → `FfiHelperNative.createScan` with the pinned config (widen, projection, filters, plan once) → every task `executeStreamPartition(partitionIndex)` → release | - -Cache semantics: entries are keyed by `scanId` (per query — separate actions -build separate entries), refcounted by open readers, and evicted after an idle -TTL. Build failures are not cached; eviction between task waves just rebuilds. - -Spark conf (all read driver-side at planning time and shipped to executors): - -- `spark.datafusion.sharedScan.targetPartitions` (default 8) — pinned - DataFusion `target_partitions`. Any constant works; it must merely be the - same everywhere, which shipping guarantees. -- `spark.datafusion.sharedScan.batchSize` (default 8192) -- `spark.datafusion.sharedScan.idleTtlMs` (default 120000) — cache idle - eviction window. - -**Determinism contract** (the price of admission — see -`FfiProviderFactory.sharedScan` Javadoc): the provider's schema, partitioning, -and per-partition contents must be a pure function of `optionsProtoBytes`. -Remote sources must pin a snapshot (version/timestamp) inside the options. 
-The connector fails tasks when an executor's partition count diverges from the -driver's, but equal counts with different contents are undetectable. The -provider's `ExecutionPlan` must also tolerate `execute(i)` being called more -than once per plan instance (task retry / speculative execution). - -Choosing a model: - -- **Per-partition payload (default)** — slices have host affinity - (`preferredLocations`), per-slice provider construction is cheap, or you - want `KeyGroupedPartitioning` + `HasPartitionKey` semantics. Bin-pack many - small slices into fewer `PartitionInfo` entries via `partitionBytes` (it is - opaque — encode a list of slice ids) before reaching for shared-scan. -- **Shared-scan** — thousands of small partitions, expensive - `createProvider`, no locality story, scan+filter+projection workloads. - Provider builds drop from one-per-task to one-per-executor (plus one driver - probe per query). - -## Caveats - -- One `FFI_LogicalExtensionCodec` per provider — v1 uses - `DefaultLogicalExtensionCodec`. If your bridge serializes custom - `LogicalNode`s, swap the codec at `FFI_TableProvider::new` time. -- Each cdylib brings its own Tokio runtime and (for TLS-using bridges) its - own rustls install. Both should be `Once`-gated. -- The widening cdylib in `connector-core` covers top-level scalars + List - children. Nested unsigned inside `Struct`/`Map` still surfaces the raw - Arrow type to Spark and fails at column-vector accessor time. Extend - `arrow_cast_widening` if you hit this. diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index 8b861bb..5cfbee3 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -21,7 +21,7 @@ //! `FfiHelperNative.createScan` / `providerSchemaIpc`, which widen the //! provider and plan/execute the scan inside the connector cdylib. //! -//! The same pattern is what domain bridges (Rerun, HDF5, custom Iceberg) use +//! 
The same pattern is what domain bridges (HDF5, custom Iceberg, in-house formats) use //! to expose their TableProviders to Spark via the connector-core DataSource //! V2 plumbing. //! diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java index 561544f..3059830 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java @@ -58,7 +58,7 @@ * (single task via {@link #listPartitions(byte[])}). * * - *

    Real bridges (Rerun, HDF5, custom Iceberg) use a protobuf schema for {@code + *

    Real bridges (HDF5, custom Iceberg, in-house formats) use a protobuf schema for {@code * optionsProtoBytes}; this example uses a hand-rolled length-prefixed binary format to keep the * wire layer obvious: * diff --git a/spark/README.md b/spark/README.md new file mode 100644 index 0000000..a251839 --- /dev/null +++ b/spark/README.md @@ -0,0 +1,314 @@ +# DataFusion Spark Connector + +This module (`datafusion-java-spark`) lets you expose a [DataFusion +`TableProvider`](https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html) +written in Rust as an [Apache Spark DataSource +V2](https://spark.apache.org/docs/latest/sql-data-sources.html) table. If you +have data that DataFusion can already read — an in-house file format, a custom +catalog, a remote service — this connector is the bridge that makes +`spark.read.format(...)` work against it, with predicate pushdown, column +pruning, and partitioned parallel reads. + +You write two small pieces (a Rust function and a Java class); the connector +supplies everything else. 
+ +## How it fits together + +Three layers, two of which already exist: + +``` + your bridge (you write this) this module (already written) ++--------------------------------+ +----------------------------------+ +| Rust cdylib | | connector cdylib (spark/native) | +| builds your TableProvider, | | type widening, session setup, | +| wraps it as FFI_TableProvider|-->| projection, filters, planning, | +| | | partition streams | +| Java FfiProviderFactory | | Scala/Java DSv2 plumbing | +| turns Spark options into | | (spark/src) schema inference, | +| bytes, hands pointers across |-->| pushdown, task planning, | +| | | shared-scan cache | ++--------------------------------+ +----------------------------------+ + | + v + spark.read.format("...").load() +``` + +The only things that cross between your Rust code and the connector are: + +- an opaque `FFI_TableProvider` pointer (your provider, handed over as a + `long`), and +- opaque `byte[]` blobs that *you* define (your options and per-partition + payloads — the connector never inspects them). + +Everything DataFusion-side (planning, filter application, execution) happens +inside the connector's native library. There is no DataFusion session on the +JVM side at all. 
+ +## What you implement + +| # | Piece | Language | Contract lives at | Working example | +|---|-------|----------|-------------------|-----------------| +| 1 | A JNI entry point that builds your `TableProvider` and returns a raw `FFI_TableProvider` pointer | Rust | — (plain `#[no_mangle]` JNI fn) | [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) | +| 2 | An `FfiProviderFactory` implementation | Java | [`src/main/java/io/datafusion/spark/FfiProviderFactory.java`](src/main/java/io/datafusion/spark/FfiProviderFactory.java) | [`examples/.../ExampleFfiProviderFactory.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java) | +| 3 | (optional) A `DatafusionSource` subclass giving your source a short name | Scala/Java | [`src/main/scala/io/datafusion/spark/DatafusionSource.scala`](src/main/scala/io/datafusion/spark/DatafusionSource.scala) | see "Wiring it into Spark" below | + +An end-to-end runnable version of all three — in-memory table, factory, and a +PySpark script that scans, filters, and projects it — lives under +[`examples/python/`](../examples/python/). + +### 1. The Rust side + +One JNI function: decode your options bytes, build an +`Arc<dyn TableProvider>`, wrap it: + +```rust +let provider: Arc<dyn TableProvider> = runtime().block_on(build_provider(opts))?; +let ffi = FFI_TableProvider::new( + provider, + /*can_support_pushdown_filters=*/ true, + Some(runtime().clone()), + FFI_TaskContextProvider::from(&ctx_provider), // throwaway local SessionContext + /*logical_codec=*/ None, // default DataFusion codec +); +Box::into_raw(Box::new(ffi)) as jlong +``` + +Ownership of the pointer transfers to whoever you hand it to (the factory +passes it straight into the connector). [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) +is a complete, commented version of this for a `MemTable`. + +### 2. The Java factory + +`FfiProviderFactory` is the contract between Spark and your bridge.
It must +have a no-arg constructor (executors instantiate it reflectively by class +name). Three methods are required: + +```java +public final class MyBridgeProviderFactory implements FfiProviderFactory { + + /** Translate Spark options ("url", "table", ...) into your own bytes. */ + @Override + public byte[] encodeOptions(Map<String, String> sparkOptions) { + return MyBridgeOptions.fromMap(sparkOptions).toProtoBytes(); + } + + /** Enumerate the slices of the dataset; one Spark task is created per entry. */ + @Override + public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { + MySlice[] slices = MyBridgeNative.listSlices(optionsProtoBytes); + PartitionInfo[] out = new PartitionInfo[slices.length]; + for (int i = 0; i < slices.length; i++) { + out[i] = new PartitionInfo(slices[i].id(), slices[i].payload(), slices[i].hosts()); + } + return out; + } + + /** Build the provider for one slice. Called with EMPTY partitionBytes for + * the driver-side schema probe — schema must not depend on the slice. */ + @Override + public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { + return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); + } +} +``` + +The optional methods — `sharedScan`, `reportPartitioning`, and the +filter-aware `listPartitions(opts, filters)` overload — are covered in their +own sections below. Their javadoc in +[`FfiProviderFactory.java`](src/main/java/io/datafusion/spark/FfiProviderFactory.java) +is the authoritative contract. + +### 3.
Wiring it into Spark + +Either pass your factory class per read: + +```python +df = (spark.read.format("datafusion") + .option("df.factory", "com.example.MyBridgeProviderFactory") + .option("url", "...") + .option("table", "my_dataset") + .load()) +``` + +or ship a ~10-line subclass so users get a short format name: + +```scala +class MyDataSource extends DatafusionSource { + override def shortName(): String = "my_format" + override protected def factoryFqcn(opts: CaseInsensitiveStringMap): String = + "com.example.MyBridgeProviderFactory" +} +``` + +registered via a +`META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` file +(this module registers `datafusion` the same way — see +[`src/main/resources/META-INF/services/`](src/main/resources/META-INF/services/)). + +## Spark tasks vs. DataFusion partitions + +This is the most important design decision when building a connector, so it +gets its own section. + +Spark parallelism and DataFusion parallelism are different things: + +- A **Spark task** is the unit Spark schedules onto an executor core. Each + task carries fixed overhead: scheduling on the driver, (de)serializing the + task, instantiating your factory, building a provider, planning a scan. +- A **DataFusion partition** is one output stream of a planned physical + query. A single plan usually has several. + +The connector supports two ways of mapping one onto the other: + +### Default mode: one Spark task per `PartitionInfo` + +`listPartitions` returns N entries → Spark runs N tasks. Each task calls +`createProvider(opts, partitionBytes)` with *its own* entry's payload, so each +task plans and scans only its slice. If DataFusion happens to plan that slice +into multiple internal partitions, they are merged into one stream for the +task — within a task there is no extra parallelism, by design (the +parallelism budget belongs to Spark). + +You control the mapping entirely through what you return from +`listPartitions`. 
Sizing guidance: + +- **Don't emit one `PartitionInfo` per tiny fragment.** A Spark task should + do meaningfully more work than its overhead — as a rule of thumb at least + ~100 ms of scan time, or order-100 MB of data (Spark's own file sources + default to 128 MB per task for the same reason). If your natural unit is a + small chunk (an object-store key, a time slice, a recording segment), + **bin-pack several into one entry**: `partitionBytes` is opaque, so encode + a *list* of chunk ids and have your `createProvider` materialise all of + them in one provider. +- **Watch the total task count.** The Spark driver schedules and tracks every + task; beyond the low thousands of tasks per stage you pay growing driver + CPU/memory and UI lag for no extra throughput once the cluster's cores are + saturated. A healthy target is roughly 2–3 tasks per available core, and + rarely more than a few thousand per scan. Tens of thousands of + single-digit-megabyte tasks is a smell — bin-pack first. +- **Locality and partition keys only exist here.** `preferredLocations` + (host affinity) and `HasPartitionKey`/`reportPartitioning` (shuffle + elision) are properties of `PartitionInfo` entries. If you need either, + use this mode. + +### Shared-scan mode: one Spark task per DataFusion partition + +When provider construction itself is expensive (remote metadata, connection +setup) or the dataset has thousands of small natural partitions, per-task +provider builds dominate. Opting in via + +```java +@Override +public boolean sharedScan(byte[] optionsProtoBytes) { return true; } +``` + +flips the mapping: the provider is built **once per executor JVM per query** +(with empty `partitionBytes`), planned once, and Spark runs one task per +*DataFusion output partition* — task `i` streams plan partition `i` from the +executor-local cached plan. `listPartitions` is not called at all. 
+ +The DataFusion partition count — and therefore the Spark task count — is +pinned by `spark.datafusion.sharedScan.targetPartitions` (default 8). The +value is resolved on the driver and shipped to executors, because +DataFusion's default would vary with each machine's core count and the +partition indices must mean the same thing everywhere. + +Choosing between the modes: + +| Choose | When | +|--------|------| +| Default (per-partition payload) | slices have host affinity, you want partition-key semantics, per-slice provider construction is cheap. Bin-pack small slices before abandoning this mode. | +| Shared-scan | provider construction is expensive, there are thousands of small partitions with no locality story, the workload is scan + filter + projection. Provider builds drop from one-per-task to one-per-executor (plus one driver probe per query). | + +Shared-scan's price of admission is a **determinism contract**: the +provider's schema, partitioning, and per-partition contents must be a pure +function of `optionsProtoBytes`. Remote sources must pin a snapshot +(version/timestamp) inside the options. The connector fails tasks when an +executor's partition count diverges from the driver's, but equal counts with +different contents are undetectable by construction. The provider's +`ExecutionPlan` must also tolerate `execute(i)` being called more than once +per plan instance (Spark retries and speculatively re-executes tasks). Full +contract: `FfiProviderFactory.sharedScan` javadoc. + +Shared-scan operational details: + +- Executor cache ([`SharedScanCache.scala`](src/main/scala/io/datafusion/spark/SharedScanCache.scala)): + entries keyed per query (`scanId`), refcounted by open readers, evicted + after an idle TTL. Build failures are not cached; eviction between task + waves just rebuilds. 
+- Spark conf (read on the driver at planning time, shipped to executors): + - `spark.datafusion.sharedScan.targetPartitions` (default 8) + - `spark.datafusion.sharedScan.batchSize` (default 8192) + - `spark.datafusion.sharedScan.idleTtlMs` (default 120000) + +## What the connector does for you + +- **Schema inference** — your provider's Arrow schema, widened, becomes the + Spark schema. Driver-side, one probe build with empty `partitionBytes`. +- **Type widening** — Spark's columnar readers reject several Arrow types + DataFusion happily produces. The connector cdylib transparently casts + unsigned ints → wider signed, `Float16` → `Float32`, `Time*` → wider ints, + any-unit/tz `Timestamp` → microsecond, recursively through + `List`/`LargeList`/`FixedSizeList` (see + [`native/src/widening.rs`](native/src/widening.rs)). Caveat: unsigned types + nested inside `Struct`/`Map` are not yet covered. +- **Predicate pushdown** — Spark V2 `Predicate`s are translated to DataFusion + expressions ([`SparkPredicateTranslator.scala`](src/main/scala/io/datafusion/spark/SparkPredicateTranslator.scala)), + shipped as `datafusion-proto` bytes, and applied inside the native plan, so + your provider's `supports_filters_pushdown`/`scan` sees real Rust `Expr`s. + Anything untranslatable stays in Spark as a residual filter — over-claiming + is impossible by construction. +- **Column pruning** — Spark's required-columns projection becomes a + DataFusion projection on the native plan. +- **Partition-aware joins/aggregations** (default mode, optional) — declare + `reportPartitioning` + per-partition key values and Spark can elide + shuffles. See the javadoc on + [`ReportedPartitioning.java`](src/main/java/io/datafusion/spark/ReportedPartitioning.java) + and [`PartitionInfo.java`](src/main/java/io/datafusion/spark/PartitionInfo.java); + note Spark 3.3+ additionally requires + `spark.sql.sources.v2.bucketing.enabled=true` for storage-partitioned + joins. 
+ +## What runs where + +| Phase | Where | Path | +| ----- | ----- | ---- | +| Schema inference | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → connector cdylib widens + returns the Arrow schema | +| Scan planning (default mode) | Driver | `factory.listPartitions(opts[, filters])` → one task per entry, with its `partitionBytes` + `preferredLocations` | +| Scan planning (shared-scan) | Driver | probe build (same code path executors use) → plan partition count `N` → `N` tasks | +| Predicate translation | Driver | `SparkPredicateTranslator` → proto bytes per pushed predicate | +| Per-task scan (default mode) | Executor | `createProvider(opts, partitionBytes)` → `FfiHelperNative.createScan` (widen, project, filter, plan) → stream whole plan | +| Per-task scan (shared-scan) | Executor | cache-acquire by `scanId` (first task builds) → stream plan partition `i` → release | + +The JNI surface backing all of this is +[`FfiHelperNative.java`](src/main/java/io/datafusion/spark/FfiHelperNative.java) +/ [`native/src/scan.rs`](native/src/scan.rs). + +## Module layout + +``` +spark/ +├── src/main/java/io/datafusion/spark/ public SPI + JNI boundary (Java on +│ purpose: bridge jars stay Scala-free) +│ FfiProviderFactory.java <- the contract you implement +│ PartitionInfo.java <- one entry = one Spark task +│ ReportedPartitioning.java <- optional shuffle-elision declaration +│ FfiHelperNative.java <- JNI into the connector cdylib +├── src/main/scala/io/datafusion/spark/ connector internals (DSv2 wiring, +│ readers, pushdown, shared-scan cache) +└── native/ connector cdylib (widening + scan + planning/execution, Rust) +``` + +## Caveats + +- One logical-extension codec per provider — the connector uses DataFusion's + default codec when deserializing pushed filter expressions, which covers + columns, literals, and built-in functions. Bridges whose providers + round-trip custom `LogicalNode`s need a custom codec at + `FFI_TableProvider::new` time. 
+- Each cdylib brings its own Tokio runtime and (for TLS-using bridges) its + own rustls install. Both should be `Once`-gated in your bridge. +- The connector and your bridge must agree on the `datafusion-ffi` ABI — + build both against the same DataFusion major version (this repo pins it in + the workspace [`Cargo.toml`](../Cargo.toml)). diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 2003016..df529c2 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -22,7 +22,7 @@ import java.util.Map; /** - * Bridge interface implemented per domain (Rerun, HDF5, custom Iceberg, etc.). A bridge owns its + * Bridge interface implemented per domain (HDF5, custom Iceberg, an in-house format, etc.). A bridge owns its * own proto schema for connection options and a cdylib that produces an {@code FFI_TableProvider} * pointer. The connector-core Spark plumbing is generic — it knows only this interface. * diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java index bdd990d..522d4e2 100644 --- a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -28,7 +28,7 @@ *

    Fields: * *

      - *
    • {@code id} — stable, human-readable identifier for this partition (e.g. a Rerun segment + *
    • {@code id} — stable, human-readable identifier for this partition (e.g. a segment * id). Surfaces in Spark UI, logs, and exception messages. Must be non-empty. *
    • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the executor * needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). Combined with diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index 9fbe070..2b221e5 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -39,7 +39,7 @@ sealed trait DatafusionPartition extends InputPartition * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is * applied natively via `FfiHelperNative.createScan`. - * - `partitionId`: stable identifier (e.g. Rerun segment id) — surfaces in Spark UI/logs/errors. + * - `partitionId`: stable identifier (e.g. a segment or file id) — surfaces in Spark UI/logs/errors. * - `partitionBytes`: opaque per-partition payload from `PartitionInfo.partitionBytes`. Passed * back into `createProvider` so the bridge materialises *this* slice. * - `preferredLocs`: hostnames where this partition's data lives; returned from diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index fd1d66c..58a5884 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * Generic Spark DataSource V2 entry point. 
Concrete bridges either: - * - Subclass and override [[shortName]] + [[factoryFqcn]] (the rerun-connector pattern), or + * - Subclass and override [[shortName]] + [[factoryFqcn]] (the short-name shim pattern), or * - Use this class directly with `option("df.factory", "fully.qualified.FactoryClass")`. * * Schema discovery happens driver-side inside the connector cdylib: the factory's @@ -50,9 +50,8 @@ class DatafusionSource extends TableProvider with DataSourceRegister { protected val FactoryOptionKey: String = "df.factory" /** - * Resolve the bridge factory class name from the Spark options. Subclasses (e.g. - * `RerunDataSource`) override to return a hard-coded FQCN so users don't need to set - * `df.factory` themselves. + * Resolve the bridge factory class name from the Spark options. Subclasses override to return a + * hard-coded FQCN so users don't need to set `df.factory` themselves. */ protected def factoryFqcn(options: CaseInsensitiveStringMap): String = { val v = options.get(FactoryOptionKey) From e9f3f612b746bc190ba1e1ca51b8b77f6a5f8b59 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 16:26:50 +0200 Subject: [PATCH 13/22] feat(spark): add datafusion-spark-bridge SDK for static bridges Bridges that own their provider's source shouldn't pay the FFI_TableProvider hop or the datafusion-ffi ABI lockstep it forces. The new spark/bridge rlib carries the widening + scan machinery, provider-source-agnostic: each JNI body takes a closure that supplies the provider. export_bridge! generates the full JNI surface for a bridge's own cdylib under a bridge-chosen class name, so several bridges can coexist in one Spark JVM; the builder receives the raw option/partition bytes and returns a concrete Arc. 
- datafusion-ffi import lives behind a default-on `ffi` feature; static bridges build with no-default-features and drop the dependency entirely - datafusion-spark-helper shrinks to JNI shims for the generic io.datafusion.spark.FfiHelperNative path (symbol set unchanged) - document the FFI_TaskContextProvider lifetime contract: the host SessionContext must outlive every provider built from it JVM-side dispatch to bridge-named native classes is a follow-up; until then export_bridge! cdylibs build but the connector still routes through FfiHelperNative. Co-Authored-By: Claude Fable 5 --- Cargo.lock | 10 +- Cargo.toml | 1 + examples/native/src/lib.rs | 6 +- spark/README.md | 73 +++++++- spark/bridge/Cargo.toml | 42 +++++ spark/bridge/src/ffi.rs | 43 +++++ spark/bridge/src/lib.rs | 226 +++++++++++++++++++++++ spark/{native => bridge}/src/scan.rs | 190 +++++++++---------- spark/{native => bridge}/src/widening.rs | 0 spark/bridge/tests/export_macro.rs | 52 ++++++ spark/native/Cargo.toml | 20 +- spark/native/src/lib.rs | 103 +++++++++-- 12 files changed, 615 insertions(+), 151 deletions(-) create mode 100644 spark/bridge/Cargo.toml create mode 100644 spark/bridge/src/ffi.rs create mode 100644 spark/bridge/src/lib.rs rename spark/{native => bridge}/src/scan.rs (62%) rename spark/{native => bridge}/src/widening.rs (100%) create mode 100644 spark/bridge/tests/export_macro.rs diff --git a/Cargo.lock b/Cargo.lock index 4a7b53d..ce9e1a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1682,7 +1682,7 @@ dependencies = [ ] [[package]] -name = "datafusion-spark-helper" +name = "datafusion-spark-bridge" version = "0.1.0" dependencies = [ "arrow", @@ -1697,6 +1697,14 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-spark-helper" +version = "0.1.0" +dependencies = [ + "datafusion-spark-bridge", + "jni", +] + [[package]] name = "datafusion-sql" version = "53.1.0" diff --git a/Cargo.toml b/Cargo.toml index c9f0f58..0597e55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 
+21,7 @@ members = [ "native", "native-common", "examples/native", + "spark/bridge", "spark/native", ] diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index 5cfbee3..756618e 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -68,8 +68,10 @@ fn runtime() -> &'static Handle { .handle() } -/// Throwaway `SessionContext` used only to obtain a `TaskContextProvider` -/// for `FFI_TableProvider::new`. The example does not register anything on it. +/// Host `SessionContext` used only to obtain a `TaskContextProvider` for +/// `FFI_TableProvider::new`. Static on purpose: the `FFI_TaskContextProvider` +/// holds a non-owning reference, so this context must outlive every provider +/// built from it. Nothing is ever registered on it. fn host_session_context() -> &'static Arc { use std::sync::OnceLock; static CTX: OnceLock> = OnceLock::new(); diff --git a/spark/README.md b/spark/README.md index a251839..4f10fb8 100644 --- a/spark/README.md +++ b/spark/README.md @@ -58,23 +58,76 @@ PySpark script that scans, filters, and projects it — lives under ### 1. The Rust side -One JNI function: decode your options bytes, build an -`Arc`, wrap it: +Two ways to build your cdylib. **Static (preferred when you own the +provider's source):** depend on the [`datafusion-spark-bridge`](bridge/) +SDK crate and let it generate the JNI surface — no `FFI_TableProvider`, no +`datafusion-ffi` ABI coupling, one cdylib, your choice of DataFusion version: ```rust +use std::sync::Arc; +use datafusion_spark_bridge::datafusion::catalog::TableProvider; +use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult}; + +fn build_provider( + ctx: &BridgeContext, + options: &[u8], + partition: &[u8], +) -> JniResult> { + let opts = MyOptions::decode(options)?; + Ok(ctx.block_on(MyProvider::connect(opts, partition))?) +} + +export_bridge! { + // Underscore-mangled name of YOUR Java class declaring the native + // methods (dots -> underscores). 
Per-bridge names let several bridges + // coexist in one Spark JVM. + jni_class: "com_example_mybridge_BridgeNative", + build_provider: build_provider, +} +``` + +The macro's rustdoc lists the exact `static native` method set the named +Java class must declare. (JVM-side plumbing that routes the connector to a +bridge-named native class instead of the generic helper is the next step on +this path.) + +**FFI (when the provider arrives precompiled, or must stay on a different +DataFusion version):** one JNI function that decodes your options bytes, +builds an `Arc`, and wraps it: + +```rust +/// Host SessionContext for FFI_TableProvider::new's task-context plumbing. +/// MUST outlive every provider built from it — the FFI_TaskContextProvider +/// holds a non-owning reference, and the connector calls back through it on +/// every scan. Keep it in a static; a function-local context dropped after +/// this call leaves the provider with a dangling task-context source. +fn host_session_context() -> &'static Arc { + static CTX: OnceLock> = OnceLock::new(); + CTX.get_or_init(|| Arc::new(SessionContext::new())) +} + let provider: Arc = runtime().block_on(build_provider(opts))?; +let ctx_provider: Arc = + Arc::clone(host_session_context()) as Arc; let ffi = FFI_TableProvider::new( provider, /*can_support_pushdown_filters=*/ true, Some(runtime().clone()), - FFI_TaskContextProvider::from(&ctx_provider), // throwaway local SessionContext - /*logical_codec=*/ None, // default DataFusion codec + FFI_TaskContextProvider::from(&ctx_provider), + /*logical_codec=*/ None, // default DataFusion codec ); Box::into_raw(Box::new(ffi)) as jlong ``` -Ownership of the pointer transfers to whoever you hand it to (the factory -passes it straight into the connector). [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) +Two lifetime rules: + +- Ownership of the returned pointer transfers to whoever you hand it to (the + factory passes it straight into the connector). 
+- The `SessionContext` behind the `FFI_TaskContextProvider` must live as long + as any provider built from it — hence the `static` above. Nothing is ever + registered on it; it exists only so scans can obtain a task context. + +[`examples/native/src/lib.rs`](../examples/native/src/lib.rs) is a complete, commented version of this for a `MemTable`. ### 2. The Java factory @@ -296,8 +349,12 @@ spark/ │ FfiHelperNative.java <- JNI into the connector cdylib ├── src/main/scala/io/datafusion/spark/ connector internals (DSv2 wiring, │ readers, pushdown, shared-scan cache) -└── native/ connector cdylib (widening + scan - planning/execution, Rust) +├── bridge/ datafusion-spark-bridge SDK rlib: +│ widening + scan machinery + +│ export_bridge! for static bridges +└── native/ connector cdylib: thin JNI shims for + the generic FfiHelperNative (FFI + path), all logic in bridge/ ``` ## Caveats diff --git a/spark/bridge/Cargo.toml b/spark/bridge/Cargo.toml new file mode 100644 index 0000000..26abe2a --- /dev/null +++ b/spark/bridge/Cargo.toml @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-spark-bridge" +version = "0.1.0" +edition = "2021" +publish = false +description = "SDK for building Spark connector bridges over DataFusion TableProviders" + +[features] +default = ["ffi"] +# Import providers across a cdylib boundary as FFI_TableProvider. Bridges +# that statically link their provider via `export_bridge!` don't need it +# and can drop the datafusion-ffi dependency entirely. +ffi = ["dep:datafusion-ffi"] + +[dependencies] +arrow = { workspace = true } +async-trait = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true, optional = true } +datafusion-jni-common = { path = "../../native-common" } +datafusion-proto = { workspace = true } +futures = { workspace = true } +jni = { workspace = true } +prost = { workspace = true } +tokio = { workspace = true } diff --git a/spark/bridge/src/ffi.rs b/spark/bridge/src/ffi.rs new file mode 100644 index 0000000..1ac8630 --- /dev/null +++ b/spark/bridge/src/ffi.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Importing providers across a cdylib boundary (the generic FFI path). 
+ +use std::sync::Arc; + +use datafusion::catalog::TableProvider; +use datafusion_ffi::table_provider::FFI_TableProvider; +use datafusion_jni_common::errors::JniResult; +use jni::sys::jlong; + +/// Take ownership of a bridge cdylib's `FFI_TableProvider` pointer and return +/// the in-process provider view. The pointer must be the raw boxed address +/// (`Box::into_raw(Box::new(FFI_TableProvider))`) and must not be reused +/// after this call. +pub fn import_ffi_provider(ffi_raw_ptr: jlong) -> JniResult<Arc<dyn TableProvider>> { + if ffi_raw_ptr == 0 { + return Err("FFI_TableProvider pointer is null".into()); + } + let ffi_raw: Box<FFI_TableProvider> = + unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; + // `Arc::<dyn TableProvider>::from(&FFI_TableProvider)` returns a + // ForeignTableProvider that delegates through the producer's vtable; it + // owns its own retained copy, so our Box can drop immediately. + let provider: Arc<dyn TableProvider> = (&*ffi_raw).into(); + drop(ffi_raw); + Ok(provider) +} diff --git a/spark/bridge/src/lib.rs b/spark/bridge/src/lib.rs new file mode 100644 index 0000000..4b6ec24 --- /dev/null +++ b/spark/bridge/src/lib.rs @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
SDK for building Spark connector bridges over DataFusion `TableProvider`s. +//! +//! Everything the Spark connector needs DataFusion-side lives here: the +//! Spark-type [`widening`] layer, and the [`scan`] machinery (session from +//! pinned config, projection, proto filters, planning, partition streams). +//! Two ways to consume it: +//! +//! - **Static bridge (preferred when you own the provider's source).** Your +//! cdylib depends on this crate and invokes [`export_bridge!`] with a +//! builder that constructs your concrete `TableProvider` from option / +//! partition bytes. One cdylib, no `datafusion-ffi` ABI boundary, your +//! choice of DataFusion version. +//! +//! - **FFI bridge (when the provider arrives precompiled).** A cdylib takes +//! a raw `FFI_TableProvider` pointer from another library and imports it +//! via [`ffi::import_ffi_provider`]. This is what the connector's own +//! `datafusion-spark-helper` cdylib does for the generic +//! `io.datafusion.spark.FfiHelperNative` path. + +pub mod scan; +pub mod widening; + +#[cfg(feature = "ffi")] +pub mod ffi; + +// Re-exported so `export_bridge!` expansions resolve these crates inside the +// bridge author's crate without extra dependencies, and so builder signatures +// can be written against `datafusion_spark_bridge::datafusion::...`. +pub use datafusion; +pub use datafusion_jni_common::errors::JniResult; +pub use jni; + +use tokio::runtime::Handle; + +/// Execution environment handed to a bridge's provider builder. +/// +/// Provider construction frequently needs async IO (remote catalogs, +/// object-store metadata); run it on the bridge runtime via [`block_on`] +/// rather than creating a runtime of your own. +/// +/// [`block_on`]: BridgeContext::block_on +pub struct BridgeContext { + handle: &'static Handle, +} + +impl BridgeContext { + /// Used by `export_bridge!` expansions; not part of the public API. 
+ #[doc(hidden)] + pub fn get() -> Self { + BridgeContext { + handle: runtime_handle(), + } + } + + /// The cdylib-wide Tokio runtime handle (also the runtime scans run on). + pub fn handle(&self) -> &Handle { + self.handle + } + + /// Block the current (JVM) thread on `fut`, driving it on the bridge + /// runtime. + pub fn block_on(&self, fut: F) -> F::Output { + self.handle.block_on(fut) + } +} + +/// Per-cdylib Tokio runtime (the singleton from `datafusion-jni-common`). +pub(crate) fn runtime_handle() -> &'static Handle { + datafusion_jni_common::runtime().handle() +} + +/// Generate the JNI entry points for a static bridge cdylib. +/// +/// `jni_class` is the **underscore-mangled** binary name of the Java class +/// declaring the matching `native` methods: dots become underscores +/// (`com.example.mybridge.BridgeNative` → `"com_example_mybridge_BridgeNative"`). +/// If the class or package name itself contains an underscore, JNI mangling +/// requires it written as `_1`. Per-bridge class names are what let several +/// bridges coexist in one Spark JVM — never export under +/// `io_datafusion_spark_FfiHelperNative`, that name belongs to the generic +/// FFI helper. +/// +/// `build_provider` is anything callable as +/// `Fn(&BridgeContext, &[u8], &[u8]) -> JniResult>`, +/// receiving the options bytes and partition bytes your JVM factory encoded. +/// The schema probe calls it with empty partition bytes; the scan path passes +/// each task's payload. Return errors boxed from `DataFusionError` to surface +/// as the typed `org.apache.datafusion.*` exception hierarchy. 
+/// +/// The generated Java-side surface (declare these as `static native` on the +/// class named by `jni_class`): +/// +/// ```java +/// static native byte[] providerSchemaIpc(byte[] options, byte[] partition); +/// static native long createScan(byte[] options, byte[] partition, +/// int targetPartitions, int batchSize, String[] optionKeys, +/// String[] optionValues, String[] projectionColumns, byte[][] filterProtos); +/// static native int partitionCount(long scanHandle); +/// static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); +/// static native void executeStream(long scanHandle, long ffiStreamAddr); +/// static native void closeScan(long scanHandle); +/// ``` +#[macro_export] +macro_rules! export_bridge { + (jni_class: $cls:literal, build_provider: $builder:expr $(,)?) => { + const _: () = { + use $crate::jni::objects::{JByteArray, JClass, JObjectArray}; + use $crate::jni::sys::{jbyteArray, jint, jlong}; + use $crate::jni::JNIEnv; + + fn __df_bridge_build( + env: &mut JNIEnv, + options: &JByteArray, + partition: &JByteArray, + ) -> $crate::JniResult> + { + let opts: Vec = if options.is_null() { + Vec::new() + } else { + env.convert_byte_array(options)? + }; + let part: Vec = if partition.is_null() { + Vec::new() + } else { + env.convert_byte_array(partition)? 
+ }; + let ctx = $crate::BridgeContext::get(); + ($builder)(&ctx, opts.as_slice(), part.as_slice()) + } + + #[export_name = concat!("Java_", $cls, "_providerSchemaIpc")] + extern "system" fn __df_bridge_provider_schema_ipc<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + options: JByteArray<'local>, + partition: JByteArray<'local>, + ) -> jbyteArray { + $crate::scan::provider_schema_ipc(&mut env, |env| { + __df_bridge_build(env, &options, &partition) + }) + } + + #[export_name = concat!("Java_", $cls, "_createScan")] + #[allow(clippy::too_many_arguments)] + extern "system" fn __df_bridge_create_scan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + options: JByteArray<'local>, + partition: JByteArray<'local>, + target_partitions: jint, + batch_size: jint, + option_keys: JObjectArray<'local>, + option_values: JObjectArray<'local>, + projection_columns: JObjectArray<'local>, + filter_protos: JObjectArray<'local>, + ) -> jlong { + $crate::scan::create_scan( + &mut env, + |env| __df_bridge_build(env, &options, &partition), + target_partitions, + batch_size, + &option_keys, + &option_values, + &projection_columns, + &filter_protos, + ) + } + + #[export_name = concat!("Java_", $cls, "_partitionCount")] + extern "system" fn __df_bridge_partition_count<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + ) -> jint { + $crate::scan::partition_count(&mut env, handle) + } + + #[export_name = concat!("Java_", $cls, "_executeStreamPartition")] + extern "system" fn __df_bridge_execute_stream_partition<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + partition: jint, + ffi_stream_addr: jlong, + ) { + $crate::scan::execute_stream_partition(&mut env, handle, partition, ffi_stream_addr) + } + + #[export_name = concat!("Java_", $cls, "_executeStream")] + extern "system" fn __df_bridge_execute_stream<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + ffi_stream_addr: 
jlong, + ) { + $crate::scan::execute_stream(&mut env, handle, ffi_stream_addr) + } + + #[export_name = concat!("Java_", $cls, "_closeScan")] + extern "system" fn __df_bridge_close_scan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + ) { + $crate::scan::close_scan(&mut env, handle) + } + }; + }; +} diff --git a/spark/native/src/scan.rs b/spark/bridge/src/scan.rs similarity index 62% rename from spark/native/src/scan.rs rename to spark/bridge/src/scan.rs index aacfd45..43f580d 100644 --- a/spark/native/src/scan.rs +++ b/spark/bridge/src/scan.rs @@ -15,29 +15,36 @@ // specific language governing permissions and limitations // under the License. -//! Planning and execution of a Spark scan, entirely inside this cdylib. +//! Planning and execution of a Spark scan, provider-source-agnostic. //! -//! `createScan` takes ownership of a bridge's `FFI_TableProvider` pointer, -//! wraps the inner provider in a [`WideningTableProvider`] (in-process — no -//! re-FFI hop), registers it on a private `SessionContext` built from the -//! caller-pinned config, applies the pruned projection and the proto-encoded -//! pushed filters, and plans exactly once. The returned handle supports: +//! Every function here is the body of one JNI entry point; the caller (the +//! generic FFI cdylib, or a static bridge's `export_bridge!` expansion) +//! supplies only how the provider is obtained, as a `make` closure. The +//! provider is wrapped in a [`WideningTableProvider`] here, so both binding +//! styles get identical Spark-compatible Arrow types. //! -//! - `partitionCount` — output partitions of the physical plan (shared-scan -//! mode probes this on the driver and indexes tasks by it); -//! - `executeStreamPartition` — an independent stream over ONE plan +//! [`create_scan`] registers the widened provider on a private +//! `SessionContext` built from the caller-pinned config, applies the pruned +//! 
projection and the proto-encoded pushed filters, and plans exactly once. +//! The returned handle supports: +//! +//! - [`partition_count`] — output partitions of the physical plan +//! (shared-scan mode probes this on the driver and indexes tasks by it); +//! - [`execute_stream_partition`] — an independent stream over ONE plan //! partition, concurrently callable from multiple JVM threads //! (`ExecutionPlan` and `TaskContext` are `Send + Sync`; each call only -//! clones their `Arc`s). Re-executing the same partition index (Spark task -//! retry / speculative execution) opens its own stream, but only succeeds -//! when every operator in that partition's pipeline supports repeated -//! `execute()` — stateless scans do, `RepartitionExec` pipelines do not; -//! - `executeStream` — the whole plan as one stream (legacy per-partition -//! payload mode, where the provider itself is the task's slice); -//! - `closeScan` — drop the plan. The single unsafe interleaving is closing -//! a handle that still has an in-flight call; the Java consumer (the -//! shared-scan cache) prevents it with a refcount covering every open -//! reader. +//! clones their `Arc`s). Re-executing the same partition index (Spark +//! task retry / speculative execution) opens its own stream, but only +//! succeeds when every operator in that partition's pipeline supports +//! repeated `execute()` — stateless scans do, `RepartitionExec` +//! pipelines do not; +//! - [`execute_stream`] — the whole plan as one stream (legacy +//! per-partition payload mode, where the provider itself is the task's +//! slice); +//! - [`close_scan`] — drop the plan. The single unsafe interleaving is +//! closing a handle that still has an in-flight call; the Java consumer +//! (the shared-scan cache) prevents it with a refcount covering every +//! open reader. //! //! Pinned-config determinism: the driver resolves `target_partitions` / //! 
`batch_size` / option overrides once and ships them to every executor, so @@ -52,20 +59,19 @@ use datafusion::arrow::ipc::writer::StreamWriter; use datafusion::catalog::TableProvider; use datafusion::dataframe::DataFrame; use datafusion::execution::TaskContext; -use datafusion::physical_plan::{execute_stream, ExecutionPlan}; +use datafusion::physical_plan::{execute_stream as df_execute_stream, ExecutionPlan}; use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion_ffi::table_provider::FFI_TableProvider; use datafusion_jni_common::errors::{try_unwrap_or_throw, JniResult}; use datafusion_jni_common::StreamingReader; use datafusion_proto::logical_plan::from_proto::parse_expr; use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; use datafusion_proto::protobuf::LogicalExprNode; -use jni::objects::{JByteArray, JClass, JObjectArray, JString}; +use jni::objects::{JByteArray, JObjectArray, JString}; use jni::sys::{jbyteArray, jint, jlong}; use jni::JNIEnv; use prost::Message; -use crate::runtime; +use crate::runtime_handle; use crate::widening::WideningTableProvider; /// Registration name of the (single) provider on the scan's private context. @@ -81,20 +87,8 @@ struct ScanState { task_ctx: Arc, } -/// Take ownership of the bridge's `FFI_TableProvider` pointer and return the -/// widened in-process provider. -fn import_widened(ffi_raw_ptr: jlong) -> JniResult> { - if ffi_raw_ptr == 0 { - return Err("FFI_TableProvider pointer is null".into()); - } - let ffi_raw: Box = - unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; - // `Arc::::from(&FFI_TableProvider)` returns a - // ForeignTableProvider that delegates through the producer's vtable; it - // owns its own retained copy, so our Box can drop immediately. 
- let inner: Arc = (&*ffi_raw).into(); - drop(ffi_raw); - Ok(Arc::new(WideningTableProvider::new(inner))) +fn widen(provider: Arc) -> Arc { + Arc::new(WideningTableProvider::new(provider)) } fn collect_string_array(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult> { @@ -127,54 +121,48 @@ fn collect_byte_arrays(env: &mut JNIEnv, arr: &JObjectArray) -> JniResult( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_raw_ptr: jlong, +/// `make` runs once; the provider drops before returning. +pub fn provider_schema_ipc( + env: &mut JNIEnv, + make: impl FnOnce(&mut JNIEnv) -> JniResult>, ) -> jbyteArray { - try_unwrap_or_throw( - &mut env, - std::ptr::null_mut(), - |env| -> JniResult { - let widened = import_widened(ffi_raw_ptr)?; - let schema = widened.schema(); - let mut buf: Vec = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buf, schema.as_ref())?; - writer.finish()?; - } - let arr = env.byte_array_from_slice(&buf)?; - Ok(arr.into_raw()) - }, - ) + try_unwrap_or_throw(env, std::ptr::null_mut(), |env| -> JniResult { + let widened = widen(make(env)?); + let schema = widened.schema(); + let mut buf: Vec = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buf, schema.as_ref())?; + writer.finish()?; + } + let arr = env.byte_array_from_slice(&buf)?; + Ok(arr.into_raw()) + }) } -/// Build the scan: widen the provider, register it on a private context with -/// the pinned config, apply projection + pushed filters, plan once. +/// Build the scan: widen the provider from `make`, register it on a private +/// context with the pinned config, apply projection + pushed filters, plan +/// once. /// /// `target_partitions` / `batch_size` <= 0 leave the DataFusion defaults; /// `option_keys`/`option_values` are parallel arrays of config overrides; /// empty `projection_columns` selects all columns; each element of /// `filter_protos` is a serialized `datafusion.LogicalExprNode`. 
-#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_raw_ptr: jlong, +#[allow(clippy::too_many_arguments)] +pub fn create_scan( + env: &mut JNIEnv, + make: impl FnOnce(&mut JNIEnv) -> JniResult>, target_partitions: jint, batch_size: jint, - option_keys: JObjectArray<'local>, - option_values: JObjectArray<'local>, - projection_columns: JObjectArray<'local>, - filter_protos: JObjectArray<'local>, + option_keys: &JObjectArray, + option_values: &JObjectArray, + projection_columns: &JObjectArray, + filter_protos: &JObjectArray, ) -> jlong { - try_unwrap_or_throw(&mut env, 0, |env| -> JniResult { - let widened = import_widened(ffi_raw_ptr)?; + try_unwrap_or_throw(env, 0, |env| -> JniResult { + let widened = widen(make(env)?); - let keys = collect_string_array(env, &option_keys)?; - let values = collect_string_array(env, &option_values)?; + let keys = collect_string_array(env, option_keys)?; + let values = collect_string_array(env, option_values)?; if keys.len() != values.len() { return Err(format!( "option key/value arrays differ in length: {} vs {}", @@ -183,8 +171,8 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'loca ) .into()); } - let projection = collect_string_array(env, &projection_columns)?; - let filters = collect_byte_arrays(env, &filter_protos)?; + let projection = collect_string_array(env, projection_columns)?; + let filters = collect_byte_arrays(env, filter_protos)?; let mut config = SessionConfig::new(); if target_partitions > 0 { @@ -200,7 +188,7 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'loca let ctx = SessionContext::new_with_config(config); ctx.register_table(SCAN_TABLE_NAME, widened)?; - let mut df: DataFrame = runtime().block_on(ctx.table(SCAN_TABLE_NAME))?; + let mut df: DataFrame = runtime_handle().block_on(ctx.table(SCAN_TABLE_NAME))?; if !projection.is_empty() { let refs: 
Vec<&str> = projection.iter().map(String::as_str).collect(); df = df.select_columns(&refs)?; @@ -217,7 +205,7 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'loca // task_ctx() borrows; capture before create_physical_plan consumes df. let task_ctx = Arc::new(df.task_ctx()); - let plan = runtime().block_on(df.create_physical_plan())?; + let plan = runtime_handle().block_on(df.create_physical_plan())?; let state = ScanState { _ctx: ctx, @@ -228,13 +216,9 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'loca }) } -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_partitionCount<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) -> jint { - try_unwrap_or_throw(&mut env, 0, |_env| -> JniResult { +/// Output partition count of the planned physical plan. +pub fn partition_count(env: &mut JNIEnv, handle: jlong) -> jint { + try_unwrap_or_throw(env, 0, |_env| -> JniResult { if handle == 0 { return Err("scan handle is null".into()); } @@ -247,15 +231,16 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_partitionCount<' }) } -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPartition<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, +/// Open an independent stream over one plan partition, writing an +/// `FFI_ArrowArrayStream` into the caller-allocated struct at +/// `ffi_stream_addr`. +pub fn execute_stream_partition( + env: &mut JNIEnv, handle: jlong, partition: jint, ffi_stream_addr: jlong, ) { - try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { + try_unwrap_or_throw(env, (), |_env| -> JniResult<()> { if handle == 0 { return Err("scan handle is null".into()); } @@ -284,7 +269,7 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPar // tokio::spawn at execute() time (RepartitionExec et al.), which // requires a runtime context to be entered. 
let stream = { - let _guard = runtime().enter(); + let _guard = runtime_handle().enter(); plan.execute(partition as usize, task_ctx)? }; @@ -299,14 +284,8 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPar /// Whole-plan stream for legacy per-partition payload mode (the provider /// itself is the task's slice, so all plan partitions merge into one reader). -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - ffi_stream_addr: jlong, -) { - try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { +pub fn execute_stream(env: &mut JNIEnv, handle: jlong, ffi_stream_addr: jlong) { + try_unwrap_or_throw(env, (), |_env| -> JniResult<()> { if handle == 0 { return Err("scan handle is null".into()); } @@ -321,8 +300,8 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'l // execute_stream coalesces multi-partition plans behind one stream. let stream = { - let _guard = runtime().enter(); - execute_stream(plan, task_ctx)? + let _guard = runtime_handle().enter(); + df_execute_stream(plan, task_ctx)? }; let reader = StreamingReader { schema, stream }; @@ -334,13 +313,10 @@ pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'l }) } -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_closeScan<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) { - try_unwrap_or_throw(&mut env, (), |_env| -> JniResult<()> { +/// Drop the planned scan. Must not race an in-flight stream-open on the same +/// handle; the Java consumer's refcount enforces this. 
+pub fn close_scan(env: &mut JNIEnv, handle: jlong) { + try_unwrap_or_throw(env, (), |_env| -> JniResult<()> { if handle == 0 { return Err("scan handle is null".into()); } diff --git a/spark/native/src/widening.rs b/spark/bridge/src/widening.rs similarity index 100% rename from spark/native/src/widening.rs rename to spark/bridge/src/widening.rs diff --git a/spark/bridge/tests/export_macro.rs b/spark/bridge/tests/export_macro.rs new file mode 100644 index 0000000..14751c8 --- /dev/null +++ b/spark/bridge/tests/export_macro.rs @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Compile-level test of `export_bridge!`: the macro must expand to valid +//! `extern "system"` items against a plain builder function. JNI entry +//! points can't be exercised without a live JVM, so the assertion here is +//! that this test crate links with the generated symbols present. 
+ +use std::sync::Arc; + +use datafusion_spark_bridge::datafusion::arrow::datatypes::Schema; +use datafusion_spark_bridge::datafusion::catalog::TableProvider; +use datafusion_spark_bridge::datafusion::datasource::MemTable; +use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult}; + +fn build_provider( + _ctx: &BridgeContext, + _options: &[u8], + _partition: &[u8], +) -> JniResult<Arc<dyn TableProvider>> { + let schema = Arc::new(Schema::empty()); + let table = MemTable::try_new(schema, vec![vec![]])?; + Ok(Arc::new(table)) +} + +export_bridge! { + jni_class: "com_example_testbridge_BridgeNative", + build_provider: build_provider, +} + +#[test] +fn builder_contract_runs_outside_jvm() { + // Expansion + linking is the macro test; this additionally runs the + // builder through the same BridgeContext the expansion hands it. + let ctx = BridgeContext::get(); + let provider = build_provider(&ctx, &[], &[]).expect("builder failed"); + assert_eq!(provider.schema().fields().len(), 0); +} diff --git a/spark/native/Cargo.toml b/spark/native/Cargo.toml index bd9d423..cfdb1db 100644 --- a/spark/native/Cargo.toml +++ b/spark/native/Cargo.toml @@ -15,21 +15,11 @@ edition = "2021" publish = false [lib] -# cdylib for the JVM to load via System.load; rlib so Rust-level tests can -# exercise the WideningTableProvider directly without going through JNI. -crate-type = ["cdylib", "rlib"] +# cdylib for the JVM to load via System.load. All logic lives in the +# datafusion-spark-bridge rlib; this crate is only the JNI symbol surface +# for the generic io.datafusion.spark.FfiHelperNative class. +crate-type = ["cdylib"] [dependencies] -arrow = { workspace = true } -async-trait = { workspace = true } -datafusion = { workspace = true } -datafusion-ffi = { workspace = true } -# Shared JNI plumbing: error->Java-exception mapping and the per-cdylib Tokio -# runtime singleton.
The thrown classes (org.apache.datafusion.*) come from -# the datafusion-java core jar, which the Spark module already depends on. -datafusion-jni-common = { path = "../../native-common" } -datafusion-proto = { workspace = true } -futures = { workspace = true } +datafusion-spark-bridge = { path = "../bridge" } jni = { workspace = true } -prost = { workspace = true } -tokio = { workspace = true } diff --git a/spark/native/src/lib.rs b/spark/native/src/lib.rs index 038d3f9..e89918f 100644 --- a/spark/native/src/lib.rs +++ b/spark/native/src/lib.rs @@ -15,23 +15,90 @@ // specific language governing permissions and limitations // under the License. -//! Native side of the generic Spark connector. +//! Generic FFI-path cdylib behind `io.datafusion.spark.FfiHelperNative`. //! -//! Takes raw `FFI_TableProvider` pointers produced by a bridge cdylib and -//! does everything DataFusion-side in process: schema probe, widening to -//! Spark-compatible Arrow types (UInt*→signed wider, Float16→Float32, -//! Time*→Int wider, Timestamp(*, tz)→Timestamp(Microsecond, tz)), session -//! construction from the driver-pinned config, projection + proto-filter -//! application, planning, and per-partition stream execution. See [`scan`] -//! for the JNI surface and [`widening`] for the cast layer. - -use tokio::runtime::Handle; - -pub mod scan; -pub mod widening; - -/// Shared Tokio runtime (the per-cdylib singleton from -/// `datafusion-jni-common`). Planning and stream execution all run on it. -fn runtime() -> &'static Handle { - datafusion_jni_common::runtime().handle() +//! Thin JNI shims: each entry point imports the bridge cdylib's raw +//! `FFI_TableProvider` pointer and delegates to the scan machinery in +//! `datafusion-spark-bridge` (widening, session from pinned config, +//! projection, proto filters, planning, partition streams). Bridges that +//! statically link their provider use `datafusion_spark_bridge::export_bridge!` +//! 
with their own JNI class name instead of this library. + +use datafusion_spark_bridge::ffi::import_ffi_provider; +use datafusion_spark_bridge::scan; +use jni::objects::{JClass, JObjectArray}; +use jni::sys::{jbyteArray, jint, jlong}; +use jni::JNIEnv; + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_providerSchemaIpc<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_raw_ptr: jlong, +) -> jbyteArray { + scan::provider_schema_ipc(&mut env, |_env| import_ffi_provider(ffi_raw_ptr)) +} + +#[no_mangle] +#[allow(clippy::too_many_arguments)] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + ffi_raw_ptr: jlong, + target_partitions: jint, + batch_size: jint, + option_keys: JObjectArray<'local>, + option_values: JObjectArray<'local>, + projection_columns: JObjectArray<'local>, + filter_protos: JObjectArray<'local>, +) -> jlong { + scan::create_scan( + &mut env, + |_env| import_ffi_provider(ffi_raw_ptr), + target_partitions, + batch_size, + &option_keys, + &option_values, + &projection_columns, + &filter_protos, + ) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_partitionCount<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) -> jint { + scan::partition_count(&mut env, handle) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPartition<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + partition: jint, + ffi_stream_addr: jlong, +) { + scan::execute_stream_partition(&mut env, handle, partition, ffi_stream_addr) +} + +#[no_mangle] +pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, + ffi_stream_addr: jlong, +) { + scan::execute_stream(&mut env, handle, ffi_stream_addr) +} + +#[no_mangle] +pub 
extern "system" fn Java_io_datafusion_spark_FfiHelperNative_closeScan<'local>( + mut env: JNIEnv<'local>, + _class: JClass<'local>, + handle: jlong, +) { + scan::close_scan(&mut env, handle) } From dc909ce82b77713a50828e38de3425e6db5d222e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 16:40:39 +0200 Subject: [PATCH 14/22] feat(spark): ScanBackend dispatch + one-method-minimum factory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Static export_bridge! bridges had no JVM route: the connector called FfiHelperNative statically, hardwiring every scan to the generic FFI cdylib. The plumbing now talks to a ScanBackend obtained from FfiProviderFactory.scanBackend(), so a bridge can point the connector at its own JNI class (per-bridge class names; several bridges per JVM) while the default FfiScanBackend keeps the FFI path intact. Factory methods all gain working defaults — a minimal bridge overrides exactly one method (createProvider or scanBackend): - encodeOptions: OptionsCodec, key-sorted length-prefixed UTF-8 pairs; sorted because shared-scan mode uses the bytes as the scan identity. 
Rust decoder in datafusion_spark_bridge::options, pinned to the Java encoder by a shared byte fixture in both test suites - listPartitions: single whole-dataset partition - createProvider: throws with guidance; static bridges never implement it Co-Authored-By: Claude Fable 5 --- spark/README.md | 70 ++++++-- spark/bridge/src/lib.rs | 1 + spark/bridge/src/options.rs | 158 ++++++++++++++++++ .../datafusion/spark/FfiProviderFactory.java | 86 +++++++--- .../io/datafusion/spark/FfiScanBackend.java | 77 +++++++++ .../io/datafusion/spark/OptionsCodec.java | 114 +++++++++++++ .../java/io/datafusion/spark/ScanBackend.java | 88 ++++++++++ .../DatafusionColumnarPartitionReader.scala | 27 ++- .../datafusion/spark/DatafusionSource.scala | 13 +- .../spark/NativeSharedScanResources.scala | 20 ++- .../FfiProviderFactoryDefaultsTest.scala | 41 +++-- .../datafusion/spark/OptionsCodecTest.scala | 89 ++++++++++ 12 files changed, 704 insertions(+), 80 deletions(-) create mode 100644 spark/bridge/src/options.rs create mode 100644 spark/src/main/java/io/datafusion/spark/FfiScanBackend.java create mode 100644 spark/src/main/java/io/datafusion/spark/OptionsCodec.java create mode 100644 spark/src/main/java/io/datafusion/spark/ScanBackend.java create mode 100644 spark/src/test/scala/io/datafusion/spark/OptionsCodecTest.scala diff --git a/spark/README.md b/spark/README.md index 4f10fb8..ebd6cb9 100644 --- a/spark/README.md +++ b/spark/README.md @@ -87,9 +87,8 @@ export_bridge! { ``` The macro's rustdoc lists the exact `static native` method set the named -Java class must declare. (JVM-side plumbing that routes the connector to a -bridge-named native class instead of the generic helper is the next step on -this path.) +Java class must declare; your factory routes the connector to it by +overriding `scanBackend()` (see section 2). 
**FFI (when the provider arrives precompiled, or must stay on a different DataFusion version):** one JNI function that decodes your options bytes, @@ -134,18 +133,61 @@ is a complete, commented version of this for a `MemTable`. `FfiProviderFactory` is the contract between Spark and your bridge. It must have a no-arg constructor (executors instantiate it reflectively by class -name). Three methods are required: +name). Everything has a working default — Spark options are encoded with +`OptionsCodec` (decode them in Rust via +`datafusion_spark_bridge::options::decode_options`), and `listPartitions` +reports one whole-dataset partition — so a minimal bridge overrides exactly +one method, chosen by which native path it uses. + +**Static bridge:** override `scanBackend()` to delegate to the JNI class you +named in `export_bridge!`: ```java public final class MyBridgeProviderFactory implements FfiProviderFactory { - /** Translate Spark options ("url", "table", ...) into your own bytes. */ @Override - public byte[] encodeOptions(Map sparkOptions) { - return MyBridgeOptions.fromMap(sparkOptions).toProtoBytes(); + public ScanBackend scanBackend() { + return new MyBridgeBackend(); // six one-line delegations to BridgeNative } +} + +/** Declares the native methods generated by export_bridge! and loads the cdylib. */ +final class BridgeNative { + static { /* load your cdylib once, e.g. 
via a NativeLibraryLoader-style helper */ } + static native byte[] providerSchemaIpc(byte[] options, byte[] partition); + static native long createScan(byte[] options, byte[] partition, + int targetPartitions, int batchSize, String[] optionKeys, + String[] optionValues, String[] projectionColumns, byte[][] filterProtos); + static native int partitionCount(long scanHandle); + static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + static native void executeStream(long scanHandle, long ffiStreamAddr); + static native void closeScan(long scanHandle); +} +``` + +(`MyBridgeBackend implements ScanBackend` forwards each method to +`BridgeNative` — pure boilerplate the scaffold will generate.) - /** Enumerate the slices of the dataset; one Spark task is created per entry. */ +**FFI bridge:** override `createProvider` instead; the default +`scanBackend()` routes the pointer through the connector's own cdylib: + +```java +public final class MyBridgeProviderFactory implements FfiProviderFactory { + + /** Build the provider for one slice. Called with EMPTY partitionBytes for + * the driver-side schema probe — schema must not depend on the slice. */ + @Override + public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { + return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); + } +} +``` + +Override `encodeOptions` only if the bridge already has its own options +schema (e.g. a protobuf), and `listPartitions` when the dataset should split +into more than one Spark task: + +```java @Override public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { MySlice[] slices = MyBridgeNative.listSlices(optionsProtoBytes); @@ -155,17 +197,9 @@ public final class MyBridgeProviderFactory implements FfiProviderFactory { } return out; } - - /** Build the provider for one slice. Called with EMPTY partitionBytes for - * the driver-side schema probe — schema must not depend on the slice. 
*/ - @Override - public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { - return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); - } -} ``` -The optional methods — `sharedScan`, `reportPartitioning`, and the +The remaining optional methods — `sharedScan`, `reportPartitioning`, and the filter-aware `listPartitions(opts, filters)` overload — are covered in their own sections below. Their javadoc in [`FfiProviderFactory.java`](src/main/java/io/datafusion/spark/FfiProviderFactory.java) @@ -344,6 +378,8 @@ spark/ ├── src/main/java/io/datafusion/spark/ public SPI + JNI boundary (Java on │ purpose: bridge jars stay Scala-free) │ FfiProviderFactory.java <- the contract you implement +│ ScanBackend.java <- native scan surface (per-bridge +│ or the generic FfiScanBackend) │ PartitionInfo.java <- one entry = one Spark task │ ReportedPartitioning.java <- optional shuffle-elision declaration │ FfiHelperNative.java <- JNI into the connector cdylib diff --git a/spark/bridge/src/lib.rs b/spark/bridge/src/lib.rs index 4b6ec24..b9d0f82 100644 --- a/spark/bridge/src/lib.rs +++ b/spark/bridge/src/lib.rs @@ -34,6 +34,7 @@ //! `datafusion-spark-helper` cdylib does for the generic //! `io.datafusion.spark.FfiHelperNative` path. +pub mod options; pub mod scan; pub mod widening; diff --git a/spark/bridge/src/options.rs b/spark/bridge/src/options.rs new file mode 100644 index 0000000..b794561 --- /dev/null +++ b/spark/bridge/src/options.rs @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decoder for the connector's default options wire format. +//! +//! `FfiProviderFactory.encodeOptions`'s default (`OptionsCodec` on the JVM +//! side) encodes the Spark options map as length-prefixed UTF-8 pairs, +//! sorted by key: big-endian `i32` entry count, then per entry key length, +//! key bytes, value length, value bytes. Key-sorting makes the bytes a pure +//! function of the map contents — the shared-scan determinism contract uses +//! the options bytes as the scan identity. +//! +//! Bridges using the default JVM encoding read their options here: +//! +//! ```ignore +//! let opts = datafusion_spark_bridge::options::decode_options(options_bytes)?; +//! let url = opts.get("url").ok_or("missing required option 'url'")?; +//! ``` +//! +//! The two implementations are pinned to each other by the shared fixture in +//! the tests below; `OptionsCodecTest` on the JVM side asserts the same +//! bytes. + +use std::collections::BTreeMap; + +/// Decode bytes produced by the JVM `OptionsCodec.encode` (or +/// [`encode_options`]). Empty input decodes as an empty map. 
+pub fn decode_options(bytes: &[u8]) -> Result<BTreeMap<String, String>, String> { + let mut out = BTreeMap::new(); + if bytes.is_empty() { + return Ok(out); + } + let mut cursor = Cursor { bytes, pos: 0 }; + let count = cursor.read_len("entry count")?; + for i in 0..count { + let key = cursor.read_string(&format!("key of entry {i}"))?; + let value = cursor.read_string(&format!("value of entry {i}"))?; + out.insert(key, value); + } + if cursor.pos != bytes.len() { + return Err(format!( + "options blob has {} trailing byte(s) after {count} entries", + bytes.len() - cursor.pos + )); + } + Ok(out) +} + +/// Encode in the same format (key-sorted via `BTreeMap`). Primarily for +/// tests and Rust-side tooling; production encoding normally happens on the +/// JVM driver. +pub fn encode_options(options: &BTreeMap<String, String>) -> Vec<u8> { + let mut out = Vec::new(); + out.extend_from_slice(&(options.len() as i32).to_be_bytes()); + for (key, value) in options { + out.extend_from_slice(&(key.len() as i32).to_be_bytes()); + out.extend_from_slice(key.as_bytes()); + out.extend_from_slice(&(value.len() as i32).to_be_bytes()); + out.extend_from_slice(value.as_bytes()); + } + out +} + +struct Cursor<'a> { + bytes: &'a [u8], + pos: usize, +} + +impl Cursor<'_> { + fn read_len(&mut self, what: &str) -> Result<usize, String> { + if self.bytes.len() - self.pos < 4 { + return Err(format!("options blob truncated reading {what}")); + } + let raw = i32::from_be_bytes(self.bytes[self.pos..self.pos + 4].try_into().unwrap()); + self.pos += 4; + usize::try_from(raw).map_err(|_| format!("negative length for {what}: {raw}")) + } + + fn read_string(&mut self, what: &str) -> Result<String, String> { + let len = self.read_len(&format!("length of {what}"))?; + if self.bytes.len() - self.pos < len { + return Err(format!("options blob truncated reading {what}")); + } + let slice = &self.bytes[self.pos..self.pos + len]; + self.pos += len; + String::from_utf8(slice.to_vec()).map_err(|e| format!("{what} is not UTF-8: {e}")) + } +} + +#[cfg(test)] +mod tests { + use
super::*; + + /// Shared fixture: must stay byte-identical to the one asserted by the + /// JVM-side `OptionsCodecTest`. {"table": "t1", "url": "grpc://h:1"} + /// encodes (sorted: table < url) as below. + fn fixture_bytes() -> Vec<u8> { + let mut b = Vec::new(); + b.extend_from_slice(&2i32.to_be_bytes()); + for (k, v) in [("table", "t1"), ("url", "grpc://h:1")] { + b.extend_from_slice(&(k.len() as i32).to_be_bytes()); + b.extend_from_slice(k.as_bytes()); + b.extend_from_slice(&(v.len() as i32).to_be_bytes()); + b.extend_from_slice(v.as_bytes()); + } + b + } + + #[test] + fn decodes_fixture() { + let map = decode_options(&fixture_bytes()).unwrap(); + assert_eq!(map.len(), 2); + assert_eq!(map.get("table").map(String::as_str), Some("t1")); + assert_eq!(map.get("url").map(String::as_str), Some("grpc://h:1")); + } + + #[test] + fn round_trips() { + let mut map = BTreeMap::new(); + map.insert("b".to_string(), "2".to_string()); + map.insert("a".to_string(), "1".to_string()); + map.insert("unicode".to_string(), "héllo→world".to_string()); + let bytes = encode_options(&map); + assert_eq!(decode_options(&bytes).unwrap(), map); + } + + #[test] + fn empty_input_is_empty_map() { + assert!(decode_options(&[]).unwrap().is_empty()); + let empty = encode_options(&BTreeMap::new()); + assert!(decode_options(&empty).unwrap().is_empty()); + } + + #[test] + fn rejects_truncation_and_trailing_bytes() { + let bytes = fixture_bytes(); + assert!(decode_options(&bytes[..bytes.len() - 1]) + .unwrap_err() + .contains("truncated")); + let mut extended = bytes.clone(); + extended.push(0); + assert!(decode_options(&extended).unwrap_err().contains("trailing")); + } +} diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index df529c2..74b00e2 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -22,26 +22,26 @@
import java.util.Map; /** - * Bridge interface implemented per domain (HDF5, custom Iceberg, an in-house format, etc.). A bridge owns its - * own proto schema for connection options and a cdylib that produces an {@code FFI_TableProvider} - * pointer. The connector-core Spark plumbing is generic — it knows only this interface. + * Bridge interface implemented per domain (HDF5, custom Iceberg, an in-house format, etc.). A + * bridge owns its options encoding and a native scan implementation; the connector-core Spark + * plumbing is generic — it knows only this interface. * - *

      Lifecycle per Spark task: + *

      Two kinds of bridge, distinguished by which method they override: * - *

        - *
      1. {@link #encodeOptions(Map)} — driver-side, converts the Spark options map into the bridge's - * own proto bytes; ships verbatim through {@code DatafusionInputPartition}. - *
      2. {@link #listPartitions(byte[])} — driver-side, enumerates partitions as {@link - * PartitionInfo} entries. One Spark task is created per entry. Each entry carries an opaque - * {@code partitionBytes} payload that is shipped to the executor and replayed into {@link - * #createProvider(byte[], byte[])}, plus optional {@code preferredLocations} hostnames that - * drive Spark's data-locality scheduling. - *
      3. {@link #createProvider(byte[], byte[])} — executor-side, builds the bridge's {@code - * Arc<dyn TableProvider>} for this specific partition, wraps it in an {@code - * FFI_TableProvider}, returns the raw boxed pointer as a {@code jlong}. The caller owns this - * pointer and is responsible for handing it to exactly one consumer (the consumer's {@code - * Drop} releases it). - *
      + *
        + *
      • Static bridge (preferred when the provider's Rust source is yours): the cdylib is + * built with {@code datafusion_spark_bridge::export_bridge!} and constructs the provider + * from the options/partition bytes natively. Override {@link #scanBackend()} to delegate to + * the JNI class named in the macro; {@link #createProvider(byte[], byte[])} is never called. + *
      • FFI bridge (the provider arrives precompiled, or must stay on a different DataFusion + * version): override {@link #createProvider(byte[], byte[])} to return a raw {@code + * FFI_TableProvider} pointer; the default {@link #scanBackend()} routes it through the + * connector's own cdylib. + *
      + * + *

      Everything else has a working default: {@link #encodeOptions(Map)} encodes the Spark options + * via {@link OptionsCodec}, and {@link #listPartitions(byte[])} reports a single partition. A + * minimal bridge therefore overrides exactly one method. * *

      Implementations must be no-arg constructable so the Spark connector can instantiate them * reflectively via {@link Class#forName(String)} on the executor. @@ -49,11 +49,19 @@ public interface FfiProviderFactory { /** - * Convert Spark's flat option map to the bridge's proto-encoded options. Driver-side only. + * Convert Spark's flat option map to the bridge's encoded options. Driver-side only; the bytes + * ship verbatim through {@code DatafusionInputPartition} and are the scan's identity in + * shared-scan mode (encode deterministically). + * + *

      Default: {@link OptionsCodec#encode(Map)} — the key-sorted length-prefixed pair format that + * {@code datafusion_spark_bridge::options} decodes on the Rust side. Override only if the bridge + * already has its own options schema (e.g. a protobuf). * * @throws IllegalArgumentException if required options are missing or invalid */ - byte[] encodeOptions(Map sparkOptions); + default byte[] encodeOptions(Map sparkOptions) { + return OptionsCodec.encode(sparkOptions); + } /** * Enumerate partitions for this dataset. One Spark task is created per returned {@link @@ -67,8 +75,17 @@ public interface FfiProviderFactory { *

      Each partition's {@code preferredLocations} hostnames are returned from {@code * InputPartition.preferredLocations()} so Spark co-locates the task with the data; empty array = * no preference. + * + *

      Default: one partition ({@code "p0"}, empty payload, no host preference) — one Spark task + * scans the whole dataset. Fine for small tables and first bring-up; override (or opt into + * {@link #sharedScan(byte[])}) before pointing it at anything large. Size guidance lives in + * {@code spark/README.md}. */ - PartitionInfo[] listPartitions(byte[] optionsProtoBytes); + default PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { + return new PartitionInfo[] { + new PartitionInfo("p0", new byte[0], new String[0]) + }; + } /** * Filter-aware variant of {@link #listPartitions(byte[])}. The connector calls this overload with @@ -123,14 +140,37 @@ default boolean sharedScan(byte[] optionsProtoBytes) { /** * Build the underlying {@code Arc} for one partition and wrap it in an {@code * FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; the - * caller takes ownership. + * caller takes ownership. Only the FFI path ({@link FfiScanBackend}, the {@link #scanBackend()} + * default) calls this — static bridges override {@link #scanBackend()} instead and leave this + * default in place. * * @param optionsProtoBytes global options produced by {@link #encodeOptions(Map)} * @param partitionBytes per-partition slice payload from {@link PartitionInfo#partitionBytes()}. * Empty array for single-partition tables and for the driver-side schema probe in {@code * DatafusionSource.inferSchema}. */ - long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes); + default long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { + throw new UnsupportedOperationException( + getClass().getName() + + " uses the default FFI scan backend but does not implement createProvider. " + + "Override createProvider (FFI bridge) or scanBackend (static export_bridge! " + + "bridge)."); + } + + /** + * The native scan implementation this bridge talks to. 
Called wherever the connector needs + * native work — driver-side schema/plan probes and executor-side streams — always on a factory + * freshly instantiated from its class name, so the returned backend never has to be + * serializable. + * + *

      Default: the generic FFI path ({@link FfiScanBackend} over {@link + * #createProvider(byte[], byte[])} and the connector's own cdylib). Static bridges built with + * {@code datafusion_spark_bridge::export_bridge!} override this to return a backend that loads + * their cdylib and delegates each method to the JNI class named in the macro invocation. + */ + default ScanBackend scanBackend() { + return new FfiScanBackend(this); + } /** * Declare how rows are partitioned across the {@link PartitionInfo} entries returned by {@link diff --git a/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java b/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java new file mode 100644 index 0000000..f1b6ad2 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +/** + * Generic FFI {@link ScanBackend}: asks the factory for a raw {@code FFI_TableProvider} pointer + * and routes everything through the connector's own cdylib ({@link FfiHelperNative}). 
This is the + * {@link FfiProviderFactory#scanBackend()} default; bridges that statically link their provider + * via {@code export_bridge!} replace it with a backend delegating to their own native class. + */ +public final class FfiScanBackend implements ScanBackend { + + private final FfiProviderFactory factory; + + public FfiScanBackend(FfiProviderFactory factory) { + this.factory = factory; + } + + @Override + public byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes) { + long ptr = factory.createProvider(options, partitionBytes); + return FfiHelperNative.providerSchemaIpc(ptr); + } + + @Override + public long createScan( + byte[] options, + byte[] partitionBytes, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos) { + long ptr = factory.createProvider(options, partitionBytes); + return FfiHelperNative.createScan( + ptr, targetPartitions, batchSize, optionKeys, optionValues, projectionColumns, + filterProtos); + } + + @Override + public int partitionCount(long scanHandle) { + return FfiHelperNative.partitionCount(scanHandle); + } + + @Override + public void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr) { + FfiHelperNative.executeStreamPartition(scanHandle, partition, ffiStreamAddr); + } + + @Override + public void executeStream(long scanHandle, long ffiStreamAddr) { + FfiHelperNative.executeStream(scanHandle, ffiStreamAddr); + } + + @Override + public void closeScan(long scanHandle) { + FfiHelperNative.closeScan(scanHandle); + } +} diff --git a/spark/src/main/java/io/datafusion/spark/OptionsCodec.java b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java new file mode 100644 index 0000000..092ec2a --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +import java.io.ByteArrayOutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.TreeMap; + +/** + * Default wire format for {@link FfiProviderFactory#encodeOptions(Map)}: the Spark options map as + * length-prefixed UTF-8 pairs, sorted by key. + * + *

      Layout (all integers big-endian {@code int32}): entry count, then per entry key length, key + * bytes, value length, value bytes. Key-sorting makes the bytes a pure function of the map's + * contents regardless of source iteration order — required by the shared-scan determinism + * contract, where the options bytes are the cache/plan identity. + * + *

      The Rust decoder lives in {@code datafusion_spark_bridge::options}; bridges using the default + * {@code encodeOptions} read their options there as a {@code BTreeMap}. The two + * implementations are pinned to each other by a shared test fixture. + */ +public final class OptionsCodec { + + private OptionsCodec() {} + + /** Encode {@code options} sorted by key. {@code null} or empty map encodes as count 0. */ + public static byte[] encode(Map options) { + TreeMap sorted = + options == null ? new TreeMap<>() : new TreeMap<>(options); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + writeInt(out, sorted.size()); + for (Map.Entry e : sorted.entrySet()) { + if (e.getKey() == null || e.getValue() == null) { + throw new IllegalArgumentException("OptionsCodec does not accept null keys or values"); + } + writeBytes(out, e.getKey().getBytes(StandardCharsets.UTF_8)); + writeBytes(out, e.getValue().getBytes(StandardCharsets.UTF_8)); + } + return out.toByteArray(); + } + + /** Decode bytes produced by {@link #encode(Map)}. Preserves the encoded (sorted) order. 
*/ + public static Map decode(byte[] bytes) { + Map out = new LinkedHashMap<>(); + if (bytes == null || bytes.length == 0) { + return out; + } + ByteBuffer buf = ByteBuffer.wrap(bytes); + int count = readCount(buf, "entry count"); + for (int i = 0; i < count; i++) { + String key = readString(buf, "key of entry " + i); + String value = readString(buf, "value of entry " + i); + out.put(key, value); + } + if (buf.hasRemaining()) { + throw new IllegalArgumentException( + "OptionsCodec: " + buf.remaining() + " trailing byte(s) after " + count + " entries"); + } + return out; + } + + private static void writeInt(ByteArrayOutputStream out, int v) { + out.write((v >>> 24) & 0xFF); + out.write((v >>> 16) & 0xFF); + out.write((v >>> 8) & 0xFF); + out.write(v & 0xFF); + } + + private static void writeBytes(ByteArrayOutputStream out, byte[] bytes) { + writeInt(out, bytes.length); + out.write(bytes, 0, bytes.length); + } + + private static int readCount(ByteBuffer buf, String what) { + if (buf.remaining() < 4) { + throw new IllegalArgumentException("OptionsCodec: truncated " + what); + } + int v = buf.getInt(); + if (v < 0) { + throw new IllegalArgumentException("OptionsCodec: negative " + what + ": " + v); + } + return v; + } + + private static String readString(ByteBuffer buf, String what) { + int len = readCount(buf, "length of " + what); + if (buf.remaining() < len) { + throw new IllegalArgumentException("OptionsCodec: truncated " + what); + } + byte[] bytes = new byte[len]; + buf.get(bytes); + return new String(bytes, StandardCharsets.UTF_8); + } +} diff --git a/spark/src/main/java/io/datafusion/spark/ScanBackend.java b/spark/src/main/java/io/datafusion/spark/ScanBackend.java new file mode 100644 index 0000000..4bc9bc4 --- /dev/null +++ b/spark/src/main/java/io/datafusion/spark/ScanBackend.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark; + +/** + * Native scan surface the connector plumbing talks to. One method per JNI entry point of the + * {@code datafusion-spark-bridge} scan machinery; implementations only differ in which + * native library and class the calls land on: + * + *

        + *
      • {@link FfiScanBackend} (the {@link FfiProviderFactory#scanBackend()} default) builds the + * provider via {@link FfiProviderFactory#createProvider(byte[], byte[])} and routes through + * the connector's own cdylib ({@link FfiHelperNative}) — the generic FFI path. + *
      • A static bridge supplies its own implementation delegating to the class it named in its + * {@code export_bridge!} invocation, whose generated {@code createScan} builds the provider + * from {@code options}/{@code partitionBytes} directly — no pointer handover, no + * {@code datafusion-ffi}. + *
      + * + *

      Implementations must be stateless or thread-safe: the driver probes schemas and plans through + * one instance while executor tasks stream through others, and scan handles are shared across + * threads by the shared-scan cache. Handle-based methods accept handles produced by {@code + * createScan} on any instance of the same implementation. + */ +public interface ScanBackend { + + /** + * Driver-side schema probe: the widened Arrow schema of the provider described by {@code + * options} + {@code partitionBytes}, serialized as Arrow IPC bytes (deserialize with {@code + * MessageSerializer.deserializeSchema}). + */ + byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes); + + /** + * Build a planned scan and return its handle. {@code targetPartitions}/{@code batchSize} {@code + * <= 0} leave DataFusion defaults; {@code optionKeys}/{@code optionValues} are parallel config + * override arrays; empty {@code projectionColumns} selects all columns; each {@code + * filterProtos} element is a serialized {@code datafusion.LogicalExprNode}. + * + *

      The caller owns the handle and must pair it with {@link #closeScan(long)}. Closing while a + * stream opened from the handle is in flight is undefined behaviour — the shared-scan cache's + * refcount enforces this; any other caller must serialize close itself. + */ + long createScan( + byte[] options, + byte[] partitionBytes, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos); + + /** Output partition count of the planned physical plan. */ + int partitionCount(long scanHandle); + + /** + * Open an independent stream over ONE plan partition, writing an {@code FFI_ArrowArrayStream} + * into the caller-allocated struct at {@code ffiStreamAddr}. Concurrent-safe across JVM threads. + */ + void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + + /** + * Stream the WHOLE plan (all partitions coalesced) into the caller-allocated {@code + * FFI_ArrowArrayStream} at {@code ffiStreamAddr}. Used by legacy per-partition payload mode. + */ + void executeStream(long scanHandle, long ffiStreamAddr); + + /** Drop the planned scan. See {@link #createScan} for the close-vs-in-flight contract. */ + void closeScan(long scanHandle); +} diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index c59778d..6e59b3d 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -28,14 +28,13 @@ import org.apache.spark.sql.vectorized.ColumnarBatch /** * Per-task columnar reader for the per-partition payload (legacy) path. Lifecycle: * - * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg). - * 2. 
`createProvider(optionsProtoBytes, partitionBytes)` — bridge builds an `Arc` materialising the slice described by `partitionBytes`, wraps it in an - * `FFI_TableProvider`, returns the raw pointer. - * 3. `FfiHelperNative.createScan` does the rest natively: widening wrap, private + * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg) and take its + * [[ScanBackend]]. + * 2. `backend.createScan(options, partitionBytes, ...)` — builds the provider for the slice + * described by `partitionBytes` and does the rest natively: widening wrap, private * `SessionContext`, projection, pushed proto filters, physical plan. - * 4. `executeStream` streams the whole plan (the provider already IS the task's slice); - * batches surface through [[ArrowColumnarBatchIteration]]. + * 3. `backend.executeStream` streams the whole plan (the provider already IS the task's + * slice); batches surface through [[ArrowColumnarBatchIteration]]. */ class DatafusionColumnarPartitionReader( partition: DatafusionInputPartition, @@ -45,13 +44,13 @@ class DatafusionColumnarPartitionReader( private val allocator = new RootAllocator(Long.MaxValue) - private val factory: FfiProviderFactory = instantiateFactory(partition.factoryFqcn) + private val backend: ScanBackend = instantiateFactory(partition.factoryFqcn).scanBackend() private val scanHandle: Long = try { - val rawPtr = factory.createProvider(partition.optionsProtoBytes, partition.partitionBytes) - FfiHelperNative.createScan( - rawPtr, + backend.createScan( + partition.optionsProtoBytes, + partition.partitionBytes, /* targetPartitions = */ -1, /* batchSize = */ -1, Array.empty[String], @@ -69,11 +68,11 @@ class DatafusionColumnarPartitionReader( override protected val arrowReader: ArrowReader = try { FfiStream.importReader(allocator) { addr => - FfiHelperNative.executeStream(scanHandle, addr) + backend.executeStream(scanHandle, addr) } } catch { case t: Throwable => - try FfiHelperNative.closeScan(scanHandle) + try 
backend.closeScan(scanHandle) catch { case suppressed: Throwable => t.addSuppressed(suppressed) } try allocator.close() catch { case suppressed: Throwable => t.addSuppressed(suppressed) } @@ -86,7 +85,7 @@ class DatafusionColumnarPartitionReader( try f catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } safe(arrowReader.close()) - safe(FfiHelperNative.closeScan(scanHandle)) + safe(backend.closeScan(scanHandle)) safe(allocator.close()) if (first != null) throw first } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index 58a5884..320cc76 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -36,11 +36,11 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap * - Subclass and override [[shortName]] + [[factoryFqcn]] (the short-name shim pattern), or * - Use this class directly with `option("df.factory", "fully.qualified.FactoryClass")`. * - * Schema discovery happens driver-side inside the connector cdylib: the factory's - * `FFI_TableProvider` is built and handed to `FfiHelperNative.providerSchemaIpc`, which widens it - * and returns its Arrow schema as IPC bytes. The same `optionsProtoBytes` (and the factory FQCN) - * is then carried verbatim through `DatafusionInputPartition`, so each executor task repeats the - * same factory → createScan pipeline locally. + * Schema discovery happens driver-side inside the bridge's native scan backend + * (`ScanBackend.providerSchemaIpc`), which widens the provider and returns its Arrow schema as + * IPC bytes. The same `optionsProtoBytes` (and the factory FQCN) is then carried verbatim through + * `DatafusionInputPartition`, so each executor task repeats the same factory → backend pipeline + * locally. 
*/ class DatafusionSource extends TableProvider with DataSourceRegister { @@ -68,8 +68,7 @@ class DatafusionSource extends TableProvider with DataSourceRegister { val optionsBytes = factory.encodeOptions(options.asCaseSensitiveMap()) // Schema probe: pass empty partitionBytes — bridges are required to honour an empty // payload for the driver-side probe (schema must not depend on per-partition state). - val rawPtr = factory.createProvider(optionsBytes, Array.emptyByteArray) - val ipcBytes = FfiHelperNative.providerSchemaIpc(rawPtr) + val ipcBytes = factory.scanBackend().providerSchemaIpc(optionsBytes, Array.emptyByteArray) val arrowSchema = MessageSerializer.deserializeSchema( new ReadChannel(Channels.newChannel(new ByteArrayInputStream(ipcBytes)))) ArrowToSparkSchema.toSparkSchema(arrowSchema) diff --git a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala index 3009737..dbf7e02 100644 --- a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala +++ b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala @@ -24,8 +24,8 @@ import org.apache.arrow.vector.ipc.ArrowReader import org.apache.spark.internal.Logging /** - * JNI-backed shared-scan entry: one provider, one planned scan handle inside the connector - * cdylib. + * JNI-backed shared-scan entry: one provider, one planned scan handle inside the bridge's native + * scan backend. 
* * The build sequence is the single code path for BOTH the driver-side partition-count probe and * every executor's cache entry — identical widening, registration, projection, filters, and @@ -34,10 +34,11 @@ import org.apache.spark.internal.Logging */ private[spark] final class NativeSharedScanResources( allocator: RootAllocator, + backend: ScanBackend, scanHandle: Long ) extends SharedScanResources { - override def partitionCount: Int = FfiHelperNative.partitionCount(scanHandle) + override def partitionCount: Int = backend.partitionCount(scanHandle) override def newTaskAllocator(name: String): BufferAllocator = allocator.newChildAllocator(name, 0, Long.MaxValue) @@ -46,7 +47,7 @@ private[spark] final class NativeSharedScanResources( partition: Int, taskAllocator: BufferAllocator): ArrowReader = FfiStream.importReader(taskAllocator) { addr => - FfiHelperNative.executeStreamPartition(scanHandle, partition, addr) + backend.executeStreamPartition(scanHandle, partition, addr) } override def close(): Unit = { @@ -54,7 +55,7 @@ private[spark] final class NativeSharedScanResources( def safe(f: => Unit): Unit = try f catch { case t: Throwable => if (first == null) first = t else first.addSuppressed(t) } - safe(FfiHelperNative.closeScan(scanHandle)) + safe(backend.closeScan(scanHandle)) safe(allocator.close()) if (first != null) throw first } @@ -72,14 +73,15 @@ private[spark] object NativeSharedScanResources extends Logging { .getDeclaredConstructor() .newInstance() .asInstanceOf[FfiProviderFactory] + val backend = factory.scanBackend() val allocator = new RootAllocator(Long.MaxValue) try { // Shared mode builds the dataset-wide provider: empty partitionBytes, like the // driver-side schema probe. DataFusion-native partitioning replaces listPartitions. 
- val rawPtr = factory.createProvider(spec.optionsProtoBytes, Array.emptyByteArray) - val scanHandle = FfiHelperNative.createScan( - rawPtr, + val scanHandle = backend.createScan( + spec.optionsProtoBytes, + Array.emptyByteArray, spec.pinnedConfig.targetPartitions, spec.pinnedConfig.batchSize, spec.pinnedConfig.options.map(_._1).toArray, @@ -87,7 +89,7 @@ private[spark] object NativeSharedScanResources extends Logging { spec.projectionColumnNames, spec.filterProtoBytes ) - new NativeSharedScanResources(allocator, scanHandle) + new NativeSharedScanResources(allocator, backend, scanHandle) } catch { case t: Throwable => try allocator.close() diff --git a/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala b/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala index 4dcb3a9..ff6c8ef 100644 --- a/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala +++ b/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala @@ -19,33 +19,54 @@ package io.datafusion.spark -import java.util.{Map => JMap} - import org.scalatest.funsuite.AnyFunSuite class FfiProviderFactoryDefaultsTest extends AnyFunSuite { - /** Minimal factory implementing only the abstract methods — exercises the defaults. */ + /** Factory overriding only listPartitions (to spy on its inputs). */ private class MinimalFactory extends FfiProviderFactory { var lastListPartitionsOpts: Array[Byte] = _ - override def encodeOptions(sparkOptions: JMap[String, String]): Array[Byte] = - Array.emptyByteArray - override def listPartitions(optionsProtoBytes: Array[Byte]): Array[PartitionInfo] = { lastListPartitionsOpts = optionsProtoBytes Array(new PartitionInfo("p0", Array.emptyByteArray, Array.empty[String])) } - - override def createProvider( - optionsProtoBytes: Array[Byte], - partitionBytes: Array[Byte]): Long = 0L } + /** Every method left at its default — the literal minimum a bridge can ship. 
*/ + private class EmptyFactory extends FfiProviderFactory + test("sharedScan defaults to false") { assert(!new MinimalFactory().sharedScan(Array[Byte](1, 2, 3))) } + test("default encodeOptions uses OptionsCodec") { + val opts = new java.util.HashMap[String, String]() + opts.put("url", "grpc://h:1") + val bytes = new EmptyFactory().encodeOptions(opts) + assert(java.util.Arrays.equals(bytes, OptionsCodec.encode(opts))) + assert(OptionsCodec.decode(bytes).get("url") == "grpc://h:1") + } + + test("default listPartitions reports a single whole-dataset partition") { + val partitions = new EmptyFactory().listPartitions(Array[Byte](1)) + assert(partitions.length == 1) + assert(partitions(0).id == "p0") + assert(partitions(0).partitionBytes().isEmpty) + assert(partitions(0).preferredLocations().isEmpty) + } + + test("default createProvider rejects with guidance toward scanBackend") { + val e = intercept[UnsupportedOperationException] { + new EmptyFactory().createProvider(Array.emptyByteArray, Array.emptyByteArray) + } + assert(e.getMessage.contains("scanBackend")) + } + + test("default scanBackend is the FFI path") { + assert(new EmptyFactory().scanBackend().isInstanceOf[FfiScanBackend]) + } + test("filter-aware listPartitions delegates to the filter-unaware overload") { val factory = new MinimalFactory val opts = Array[Byte](7, 8) diff --git a/spark/src/test/scala/io/datafusion/spark/OptionsCodecTest.scala b/spark/src/test/scala/io/datafusion/spark/OptionsCodecTest.scala new file mode 100644 index 0000000..59f6c8f --- /dev/null +++ b/spark/src/test/scala/io/datafusion/spark/OptionsCodecTest.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package io.datafusion.spark + +import java.io.ByteArrayOutputStream +import java.nio.charset.StandardCharsets + +import org.scalatest.funsuite.AnyFunSuite + +class OptionsCodecTest extends AnyFunSuite { + + /** + * Shared fixture: must stay byte-identical to the one asserted by the Rust-side + * `datafusion_spark_bridge::options` tests. {"table": "t1", "url": "grpc://h:1"} encodes + * (sorted: table < url) as below. + */ + private def fixtureBytes(): Array[Byte] = { + val out = new ByteArrayOutputStream() + def writeInt(v: Int): Unit = { + out.write((v >>> 24) & 0xFF); out.write((v >>> 16) & 0xFF) + out.write((v >>> 8) & 0xFF); out.write(v & 0xFF) + } + def writeString(s: String): Unit = { + val b = s.getBytes(StandardCharsets.UTF_8) + writeInt(b.length) + out.write(b, 0, b.length) + } + writeInt(2) + Seq("table" -> "t1", "url" -> "grpc://h:1").foreach { case (k, v) => + writeString(k); writeString(v) + } + out.toByteArray + } + + test("encodes the cross-language fixture byte-identically, sorted by key") { + // Insertion order deliberately unsorted; encoding must sort. 
+ val opts = new java.util.LinkedHashMap[String, String]() + opts.put("url", "grpc://h:1") + opts.put("table", "t1") + assert(java.util.Arrays.equals(OptionsCodec.encode(opts), fixtureBytes())) + } + + test("round-trips including unicode values") { + val opts = new java.util.HashMap[String, String]() + opts.put("a", "1") + opts.put("unicode", "héllo→world") + val decoded = OptionsCodec.decode(OptionsCodec.encode(opts)) + assert(decoded.size() == 2) + assert(decoded.get("unicode") == "héllo→world") + } + + test("null and empty maps encode to a zero count and decode back empty") { + assert(OptionsCodec.decode(OptionsCodec.encode(null)).isEmpty) + assert(OptionsCodec.decode(Array.emptyByteArray).isEmpty) + } + + test("rejects truncation and trailing bytes") { + val bytes = fixtureBytes() + intercept[IllegalArgumentException] { + OptionsCodec.decode(java.util.Arrays.copyOf(bytes, bytes.length - 1)) + } + intercept[IllegalArgumentException] { + OptionsCodec.decode(java.util.Arrays.copyOf(bytes, bytes.length + 1)) + } + } + + test("rejects null keys or values") { + val opts = new java.util.HashMap[String, String]() + opts.put("k", null) + intercept[IllegalArgumentException] { OptionsCodec.encode(opts) } + } +} From 45c9613831f0663b59ed107b6568fbc14bd89bea Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 16:49:49 +0200 Subject: [PATCH 15/22] feat(spark): reusable native loader + bridge packaging recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bridges hand-rolled cdylib extraction and shipped multi-jar setups. NativeLibraryLoader goes public with load(anchor, resourcePrefix, name) — anchor-classloader lookup so it works under Spark's per-app classloaders, idempotent per (prefix, name) across per-task factory instantiation, failures clear the guard so retries can succeed. 
The examples module becomes the living packaging template: the example cdylib is bundled into the jar via an antrun copy + per-host profiles (same pattern as the connector pom), and the 40-line path-searching loader collapses to one NativeLibraryLoader.load call with -Dexample.ffi.lib.path as the unpackaged-build escape hatch. The pyspark demo drops its os.chdir(REPO_ROOT) crutch. spark/README.md gains "Packaging your bridge": single shaded fat jar with ServicesResourceTransformer, what stays provided, and the hard rule — no shade relocations, JNI binds natives by class FQCN — with the userClassPathFirst consequence for Spark's bundled Arrow. Co-Authored-By: Claude Fable 5 --- examples/README.md | 18 ++-- examples/pom.xml | 94 +++++++++++++++++++ examples/python/ffi_table_provider_demo.py | 9 +- .../FfiTableProviderExampleNative.java | 78 ++++----------- spark/README.md | 87 +++++++++++++++++ .../datafusion/spark/NativeLibraryLoader.java | 51 ++++++++-- 6 files changed, 253 insertions(+), 84 deletions(-) diff --git a/examples/README.md b/examples/README.md index 6f76c4e..6876810 100644 --- a/examples/README.md +++ b/examples/README.md @@ -77,17 +77,13 @@ To build its cdylib (workspace member, buildable from anywhere in the tree): cargo build -p datafusion-java-ffi-example --release ``` -The factory's `System.load` searches, in order: - -1. `-Dexample.ffi.lib.path=/abs/path/to/lib...` (explicit override) -2. `rust-target/release/` (cwd = repo root) -3. `rust-target/debug/` -4. `../rust-target/release/` (cwd = the `examples` module) -5. `../rust-target/debug/` - -where `` is `libdatafusion_java_ffi_example.so` (Linux), -`libdatafusion_java_ffi_example.dylib` (macOS), or -`datafusion_java_ffi_example.dll` (Windows). 
+Building the examples jar then bundles the cdylib inside it (under +`org/apache/datafusion/examples///`), and the factory loads it from +there at runtime via the connector's `NativeLibraryLoader` — the same +packaging recipe a real bridge uses (see "Packaging your bridge" in +[`../spark/README.md`](../spark/README.md)). To run against an unpackaged +local build instead, pass +`-Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.{so,dylib}`. ## Troubleshooting diff --git a/examples/pom.xml b/examples/pom.xml index 6220afe..1f156c6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -37,6 +37,9 @@ under the License. true true + + debug @@ -91,6 +94,97 @@ under the License. + + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + copy-ffi-example-cdylib + process-classes + run + + + + + + + + + + + + + + + + + + native-linux-amd64 + + unixlinuxamd64 + + + linux + x86_64 + libdatafusion_java_ffi_example.so + + + + native-linux-x86_64 + + unixlinuxx86_64 + + + linux + x86_64 + libdatafusion_java_ffi_example.so + + + + native-linux-aarch64 + + unixlinuxaarch64 + + + linux + aarch64 + libdatafusion_java_ffi_example.so + + + + native-mac-x86_64 + + macx86_64 + + + darwin + x86_64 + libdatafusion_java_ffi_example.dylib + + + + native-mac-aarch64 + + macaarch64 + + + darwin + aarch64 + libdatafusion_java_ffi_example.dylib + + + diff --git a/examples/python/ffi_table_provider_demo.py b/examples/python/ffi_table_provider_demo.py index 45510bb..1cff37b 100644 --- a/examples/python/ffi_table_provider_demo.py +++ b/examples/python/ffi_table_provider_demo.py @@ -141,11 +141,10 @@ def main() -> None: .getOrCreate() ) - # The example cdylib (libdatafusion_java_ffi_example.{so,dylib}) is loaded - # by FfiTableProviderExampleNative from examples/native/target. As long as - # PySpark is launched from the repo root the relative-path search succeeds; - # otherwise set example.ffi.lib.path via spark.driver.extraJavaOptions. 
- os.chdir(REPO_ROOT) + # The example cdylib is bundled inside the examples jar and extracted by + # NativeLibraryLoader at first use; no working-directory or path setup is + # needed. (-Dexample.ffi.lib.path via extraJavaOptions overrides it for + # unpackaged local builds.) # `name_prefix`, `num_rows`, `num_batches` are interpreted by # ExampleFfiProviderFactory.encodeOptions and decoded on the Rust side diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java index dc0cdda..a34e9c3 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java @@ -19,37 +19,35 @@ package org.apache.datafusion.examples; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Locale; +import io.datafusion.spark.NativeLibraryLoader; /** * JNI bindings into the example cdylib at {@code examples/native}. The cdylib produces a small * {@code MemTable}-backed {@code FFI_TableProvider} that {@link ExampleFfiProviderFactory} hands to * the Spark connector ({@code FfiHelperNative.createScan}). * - *

      The library is located in this order: - * - *

        - *
      1. Absolute path passed via {@code -Dexample.ffi.lib.path=/abs/path/to/lib...}. - *
      2. {@code rust-target/release/} relative to the current working directory (the - * workspace output dir; default when invoked via {@code mvn exec:java} from the repo root). - *
      3. {@code rust-target/debug/} as a fallback for {@code cargo build} without {@code - * --release}. - *
      - * - *

      If none of these exist, an {@link UnsatisfiedLinkError} surfaces with the search list so the - * user knows what to build. + *

      The cdylib is bundled inside this jar at {@code + * org/apache/datafusion/examples///} (see the antrun execution in {@code + * examples/pom.xml}) and extracted/loaded once via the connector's {@link NativeLibraryLoader} — + * the same two-piece recipe (pom copy block + one loader call) a real bridge uses to ship its own + * cdylib. For local hacking against an unpackaged build, {@code + * -Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.dylib} bypasses the bundled + * copy. */ final class FfiTableProviderExampleNative { - private static final String LIBRARY_NAME = "datafusion_java_ffi_example"; - private FfiTableProviderExampleNative() {} static { - loadLibrary(); + String explicit = System.getProperty("example.ffi.lib.path"); + if (explicit != null && !explicit.isEmpty()) { + System.load(explicit); + } else { + NativeLibraryLoader.load( + FfiTableProviderExampleNative.class, + "org/apache/datafusion/examples", + "datafusion_java_ffi_example"); + } } /** @@ -69,46 +67,4 @@ private FfiTableProviderExampleNative() {} * providerSchemaIpc}) accepts the pointer it owns the box. */ static native void dropProvider(long ffiTableProviderPtr); - - private static void loadLibrary() { - String mapped = System.mapLibraryName(LIBRARY_NAME); - Path explicit = optionalPath(System.getProperty("example.ffi.lib.path")); - - // Cover both common cwds: repo root (mvn exec from datafusion-java/) and - // the examples module (mvn exec from datafusion-java/examples/). The - // workspace writes to `rust-target/` at the repo root. 
- Path[] candidates = - new Path[] { - explicit, - Paths.get("rust-target", "release", mapped), - Paths.get("rust-target", "debug", mapped), - Paths.get("..", "rust-target", "release", mapped), - Paths.get("..", "rust-target", "debug", mapped), - }; - - for (Path candidate : candidates) { - if (candidate != null && Files.exists(candidate)) { - System.load(candidate.toAbsolutePath().toString()); - return; - } - } - - StringBuilder searched = new StringBuilder(); - for (Path c : candidates) { - if (searched.length() > 0) searched.append(", "); - searched.append(c == null ? "null" : c.toAbsolutePath().toString()); - } - throw new UnsatisfiedLinkError( - String.format( - Locale.ROOT, - "Example native library %s not found. Searched: [%s]. " - + "Build with 'cargo build -p datafusion-java-ffi-example --release', or pass " - + "-Dexample.ffi.lib.path=.", - mapped, - searched)); - } - - private static Path optionalPath(String s) { - return (s == null || s.isEmpty()) ? null : Paths.get(s); - } } diff --git a/spark/README.md b/spark/README.md index ebd6cb9..f8af742 100644 --- a/spark/README.md +++ b/spark/README.md @@ -232,6 +232,93 @@ registered via a (this module registers `datafusion` the same way — see [`src/main/resources/META-INF/services/`](src/main/resources/META-INF/services/)). 
+## Packaging your bridge + +The end-user experience to aim for is one artifact: + +```python +# spark.jars (or --packages) gets exactly one jar, then: +df = spark.read.format("my_format").option("url", "...").load() +``` + +Three pieces make that work: + +**Bundle your cdylib inside the jar.** Copy it into your jar's resources at +`///` and load it from your native +class's static initializer with the connector's loader — no hand-rolled +extraction code: + +```java +static { + NativeLibraryLoader.load(BridgeNative.class, "com/example/mybridge", "my_bridge"); +} +``` + +The pom side is one antrun copy execution plus per-host profiles; the +examples module is a complete working copy of the pattern (see the +`copy-ffi-example-cdylib` execution and the `native-*` profiles in +[`examples/pom.xml`](../examples/pom.xml), and the loader call in +[`FfiTableProviderExampleNative.java`](../examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java)). +For a multi-platform jar, build the cdylib per platform in CI and copy each +into its own `//` directory before `mvn package` — the layout +supports them side by side. + +**Shade your dependencies into one fat jar** with `maven-shade-plugin`, so +users don't assemble a jar list: + +```xml + + org.apache.maven.plugins + maven-shade-plugin + + + package + shade + + + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + +``` + +Include in the shaded jar: this connector (`datafusion-java-spark`), the core +jar (`datafusion-java` — exception classes and, if you push predicates, the +generated proto classes), the Arrow Java artifacts you compile against, and +your own classes + cdylib. Keep `spark-sql`/`scala-library` `provided` — the +cluster supplies them. + +**Do NOT relocate JNI-bound or JNI-loading packages.** JNI binds native +methods by the class's fully-qualified name; `arrow-c-data` and the Arrow +memory modules likewise load their own natives. 
Relocating +`io.datafusion.spark`, `org.apache.arrow`, or your own native class breaks +the symbol lookup at runtime. Practical consequences: + +- Ship a plain (unrelocated) fat jar. Two bridges in one Spark app then share + one copy of the connector classes — fine when they're built against the + same connector version, which is the only configuration we support anyway + (their cdylibs stay distinct via per-bridge JNI class names). +- Spark bundles its own (often older) Arrow. Since yours can't be relocated + away, have users set `spark.executor.userClassPathFirst=true` and + `spark.driver.userClassPathFirst=true` (the pyspark demo under + [`examples/python/`](../examples/python/) shows the working incantation), + or build with Arrow pinned to the cluster's version. + ## Spark tasks vs. DataFusion partitions This is the most important design decision when building a connector, so it diff --git a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java index 0b9fc22..9f330d0 100644 --- a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java +++ b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java @@ -25,29 +25,62 @@ import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.util.Locale; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; /** - * Extracts a cdylib bundled inside the connector-core jar to a temp file and loads it via {@link - * System#load}. Layout inside the jar: + * Extracts a cdylib bundled inside a jar to a temp file and loads it via {@link System#load}. + * Expected layout inside the jar: * *

      - *   io/datafusion/spark/<os>/<arch>/lib<name>.<ext>
      + *   <resourcePrefix>/<os>/<arch>/lib<name>.<ext>
        * 
      * * where {@code <os>} is one of {@code linux}, {@code darwin}, {@code windows} and {@code <arch>} is * {@code x86_64} or {@code aarch64}. + * + *

      The connector loads its own cdylib through this class (prefix {@code io/datafusion/spark}); + * bridges are encouraged to reuse it via {@link #load(Class, String, String)} from their native + * class's static initializer, with their own resource prefix, instead of hand-rolling extraction. + * Bundle the cdylib with the same antrun-copy pattern the connector's pom uses (see "Packaging + * your bridge" in {@code spark/README.md}). */ -final class NativeLibraryLoader { +public final class NativeLibraryLoader { + + /** {@code <resourcePrefix>/<name>} entries already extracted and loaded by this classloader. */ + private static final Set<String> LOADED = ConcurrentHashMap.newKeySet(); private NativeLibraryLoader() {} + /** Connector-internal entry: loads from the connector jar's own prefix. */ static void loadLibrary(String name) { + load(NativeLibraryLoader.class, "io/datafusion/spark", name); + } + + /** + * Extract {@code <resourcePrefix>/<os>/<arch>/lib<name>.<ext>} from {@code anchor}'s classloader + * and {@link System#load} it. Idempotent per (prefix, name): repeated calls — e.g. one per + * Spark task instantiating the bridge's native class — load once. + * + * @param anchor class whose classloader holds the resource (the bridge's own native class, so + * the lookup works under Spark's per-application classloaders) + * @param resourcePrefix jar-internal directory, no leading or trailing slash (e.g. {@code + * "com/example/mybridge"}) + * @param name unmapped library name (e.g. 
{@code "my_bridge"} for {@code libmy_bridge.so}) + * @throws UnsatisfiedLinkError if the resource is missing or extraction fails + */ + public static void load(Class anchor, String resourcePrefix, String name) { + String key = resourcePrefix + "/" + name; + if (!LOADED.add(key)) { + return; + } String resource = String.format( - "/io/datafusion/spark/%s/%s/%s", - currentOs(), currentArch(), System.mapLibraryName(name)); - try (InputStream in = NativeLibraryLoader.class.getResourceAsStream(resource)) { + "/%s/%s/%s/%s", + resourcePrefix, currentOs(), currentArch(), System.mapLibraryName(name)); + try (InputStream in = anchor.getResourceAsStream(resource)) { if (in == null) { + LOADED.remove(key); throw new UnsatisfiedLinkError("Native library not found on classpath: " + resource); } Path tmp = Files.createTempFile("libdatafusion-spark-", "-" + System.mapLibraryName(name)); @@ -55,8 +88,12 @@ static void loadLibrary(String name) { Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING); System.load(tmp.toAbsolutePath().toString()); } catch (IOException e) { + LOADED.remove(key); throw new UnsatisfiedLinkError( "Failed to extract native library " + resource + ": " + e.getMessage()); + } catch (RuntimeException | Error e) { + LOADED.remove(key); + throw e; } } From cc35958f6ab8017664dd41e0c753943ad2cbe43c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 17:04:41 +0200 Subject: [PATCH 16/22] feat(spark): bridge scaffold generator dev/new_bridge.py stamps a standalone bridge project from dev/bridge-template/: cdylib crate on the datafusion-spark-bridge SDK (no-default-features, working demo MemTable provider), the four Java classes, DataSourceRegister entry, shaded-jar pom with the cdylib bundled, pyspark smoke test, README. A new bridge is one Rust function away from spark.read.format("") with a single jar. 
Details that bite without the generator: JNI symbol mangling for the package (underscores need _1), Scala protected compiling to JVM-public (factoryFqcn override must be public), an empty [workspace] table so the generated crate survives workspace-rooted parent directories, and no shade relocations anywhere near JNI-bound classes. Also: dev/bridge-template/** excluded from RAT (generated projects are user code, not ASF-headered) and spotless fixes for earlier javadoc edits that only verify-phase builds had flagged. Verified by generating an 'acme' bridge, building cdylib + shaded jar, and scanning/filtering its demo table through PySpark. Co-Authored-By: Claude Fable 5 --- dev/bridge-template/.gitignore | 3 + dev/bridge-template/README.md | 54 ++++++ dev/bridge-template/native/Cargo.toml | 23 +++ dev/bridge-template/native/src/lib.rs | 59 ++++++ dev/bridge-template/pom.xml | 174 ++++++++++++++++++ dev/bridge-template/smoke_test.py | 55 ++++++ .../main/java/__PKG_PATH__/BridgeNative.java | 40 ++++ .../__PKG_PATH__/__PREFIX__DataSource.java | 21 +++ .../__PREFIX__ProviderFactory.java | 28 +++ .../__PKG_PATH__/__PREFIX__ScanBackend.java | 53 ++++++ ...pache.spark.sql.sources.DataSourceRegister | 1 + dev/new_bridge.py | 138 ++++++++++++++ .../FfiTableProviderExampleNative.java | 13 +- pom.xml | 3 + spark/README.md | 14 ++ .../io/datafusion/spark/FfiHelperNative.java | 7 +- .../datafusion/spark/FfiProviderFactory.java | 33 ++-- .../io/datafusion/spark/FfiScanBackend.java | 15 +- .../datafusion/spark/NativeLibraryLoader.java | 12 +- .../io/datafusion/spark/OptionsCodec.java | 7 +- .../io/datafusion/spark/PartitionInfo.java | 4 +- .../java/io/datafusion/spark/ScanBackend.java | 18 +- 22 files changed, 721 insertions(+), 54 deletions(-) create mode 100644 dev/bridge-template/.gitignore create mode 100644 dev/bridge-template/README.md create mode 100644 dev/bridge-template/native/Cargo.toml create mode 100644 dev/bridge-template/native/src/lib.rs create mode 100644 
dev/bridge-template/pom.xml create mode 100644 dev/bridge-template/smoke_test.py create mode 100644 dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java create mode 100644 dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java create mode 100644 dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java create mode 100644 dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java create mode 100644 dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister create mode 100644 dev/new_bridge.py diff --git a/dev/bridge-template/.gitignore b/dev/bridge-template/.gitignore new file mode 100644 index 0000000..e2777a5 --- /dev/null +++ b/dev/bridge-template/.gitignore @@ -0,0 +1,3 @@ +target/ +native/target/ +*.class diff --git a/dev/bridge-template/README.md b/dev/bridge-template/README.md new file mode 100644 index 0000000..ff3e37f --- /dev/null +++ b/dev/bridge-template/README.md @@ -0,0 +1,54 @@ +# __PREFIX__ Spark Bridge + +A Spark DataSource V2 connector for the `__FORMAT__` format, built on the +[datafusion-java Spark connector](https://github.com/apache/datafusion-java) +and its `datafusion-spark-bridge` Rust SDK. Generated by `dev/new_bridge.py`; +the only code you need to touch is marked `TODO`. + +## What's here + +| File | Role | +| --- | --- | +| `native/src/lib.rs` | **Your provider.** `build_provider` turns option/partition bytes into a DataFusion `TableProvider` (demo: an in-memory table). `export_bridge!` generates the whole JNI surface. | +| `src/main/java/.../BridgeNative.java` | Declares the generated native methods and loads the bundled cdylib. Must keep the name/package the Rust macro was generated with. | +| `src/main/java/.../__PREFIX__ScanBackend.java` | Routes the connector's scan calls to `BridgeNative`. Pure delegation. | +| `src/main/java/.../__PREFIX__ProviderFactory.java` | The connector contract. 
Override `listPartitions` / `sharedScan` / `encodeOptions` here as the bridge grows. | +| `src/main/java/.../__PREFIX__DataSource.java` + `META-INF/services/...` | `spark.read.format("__FORMAT__")`. | +| `pom.xml` | One shaded fat jar with the cdylib bundled inside. | + +## Build + +```bash +# 0. Once: install datafusion-java to your local Maven repo (from its checkout): +# cargo build && ./mvnw install -DskipTests + +# 1. The cdylib: +cargo build --manifest-path native/Cargo.toml + +# 2. The shaded jar (target/__CRATE__-0.1.0-SNAPSHOT.jar): +mvn package +``` + +Release builds: `cargo build --release --manifest-path native/Cargo.toml` and +`mvn package -Dnative.profile=release`. + +## Use + +```python +df = (spark.read.format("__FORMAT__") + .option("rows", "5") # demo option; replace with your own + .load()) +df.show() +``` + +with the shaded jar on `spark.jars`. `python3 smoke_test.py` runs exactly this +against a local Spark (needs `SPARK_HOME` pointing at a Scala 2.13 distro). + +## Where to go next + +- Replace the demo `MemTable` in `native/src/lib.rs` with your real provider. +- Split the dataset into Spark tasks (`listPartitions`) or switch to + shared-scan mode (`sharedScan`) — task-sizing guidance lives in the + connector's `spark/README.md` ("Spark tasks vs. DataFusion partitions"). +- Multi-platform jars: build the cdylib per platform in CI and copy each into + `src`-side `//` directories before `mvn package`. diff --git a/dev/bridge-template/native/Cargo.toml b/dev/bridge-template/native/Cargo.toml new file mode 100644 index 0000000..b87a927 --- /dev/null +++ b/dev/bridge-template/native/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "__CRATE__-native" +version = "0.1.0" +edition = "2021" +publish = false + +# Standalone crate: the empty [workspace] table stops cargo from adopting +# this crate into any workspace found in a parent directory. 
+[workspace] + +[lib] +name = "__LIB__" +crate-type = ["cdylib"] + +[dependencies] +# default-features = false drops the datafusion-ffi import path — a static +# bridge never crosses an FFI_TableProvider boundary. +# TODO: replace the path with a git or crates.io dependency once you build +# outside a local datafusion-java checkout. +datafusion-spark-bridge = { path = "__BRIDGE_SDK_PATH__", default-features = false } + +[profile.release] +strip = "debuginfo" diff --git a/dev/bridge-template/native/src/lib.rs b/dev/bridge-template/native/src/lib.rs new file mode 100644 index 0000000..8439217 --- /dev/null +++ b/dev/bridge-template/native/src/lib.rs @@ -0,0 +1,59 @@ +//! Native side of the `__FORMAT__` Spark bridge. +//! +//! `export_bridge!` generates the whole JNI surface for +//! `__PKG__.BridgeNative`; the only code you own is [`build_provider`], +//! which turns the option/partition bytes your JVM factory encoded into a +//! concrete `TableProvider`. Everything downstream — type widening, session +//! construction, projection, pushed filters, planning, partition streams — +//! is the SDK's job. + +use std::sync::Arc; + +use datafusion_spark_bridge::datafusion::arrow::array::{Int64Array, StringArray}; +use datafusion_spark_bridge::datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion_spark_bridge::datafusion::arrow::record_batch::RecordBatch; +use datafusion_spark_bridge::datafusion::catalog::TableProvider; +use datafusion_spark_bridge::datafusion::datasource::MemTable; +use datafusion_spark_bridge::options::decode_options; +use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult}; + +/// Build the provider for one scan. +/// +/// `options` is whatever the JVM factory's `encodeOptions` produced — with +/// the default factory that is the connector's `OptionsCodec` format, decoded +/// below into a string map. 
`partition` is the per-task payload from +/// `listPartitions` (empty for the schema probe, for shared-scan mode, and +/// for the default single-partition layout). +/// +/// TODO: replace the demo `MemTable` with your real `TableProvider`. For +/// async construction (remote catalogs, object stores), use +/// `ctx.block_on(...)`. +fn build_provider( + _ctx: &BridgeContext, + options: &[u8], + _partition: &[u8], +) -> JniResult> { + let opts = decode_options(options)?; + let rows: i64 = match opts.get("rows") { + Some(v) => v + .parse() + .map_err(|e| format!("option 'rows' is not an integer: {e}"))?, + None => 3, + }; + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("greeting", DataType::Utf8, false), + ])); + let ids = Int64Array::from_iter_values(0..rows); + let greetings = + StringArray::from_iter_values((0..rows).map(|i| format!("hello from __FORMAT__ #{i}"))); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(greetings)])?; + + Ok(Arc::new(MemTable::try_new(schema, vec![vec![batch]])?)) +} + +export_bridge! 
{ + jni_class: "__JNI_CLASS__", + build_provider: build_provider, +} diff --git a/dev/bridge-template/pom.xml b/dev/bridge-template/pom.xml new file mode 100644 index 0000000..4e8c2b6 --- /dev/null +++ b/dev/bridge-template/pom.xml @@ -0,0 +1,174 @@ + + + 4.0.0 + + __PKG__ + __CRATE__ + 0.1.0-SNAPSHOT + jar + + __PREFIX__ Spark Bridge + + + UTF-8 + 17 + 2.13 + 3.5.7 + __DF_JAVA_VERSION__ + + debug + + + + + + org.apache.datafusion + datafusion-java-spark_${scala.compat.version} + ${datafusion.java.version} + + + org.apache.spark + spark-sql_${scala.compat.version} + ${spark.version} + provided + + + + + + + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + copy-bridge-cdylib + process-classes + run + + + + + + + + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.0 + + + package + shade + + false + + + + org.scala-lang:scala-library + + + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + + + native-linux-amd64 + + unixlinuxamd64 + + + linux + x86_64 + lib__LIB__.so + + + + native-linux-x86_64 + + unixlinuxx86_64 + + + linux + x86_64 + lib__LIB__.so + + + + native-linux-aarch64 + + unixlinuxaarch64 + + + linux + aarch64 + lib__LIB__.so + + + + native-mac-x86_64 + + macx86_64 + + + darwin + x86_64 + lib__LIB__.dylib + + + + native-mac-aarch64 + + macaarch64 + + + darwin + aarch64 + lib__LIB__.dylib + + + + diff --git a/dev/bridge-template/smoke_test.py b/dev/bridge-template/smoke_test.py new file mode 100644 index 0000000..ca3925e --- /dev/null +++ b/dev/bridge-template/smoke_test.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +"""Smoke test: scan the __FORMAT__ bridge's demo table through PySpark. + +Prerequisites: + - cargo build --manifest-path native/Cargo.toml (the bridge cdylib) + - mvn package (the shaded jar) + - a Scala 2.13 Spark distribution; the PyPI pyspark wheel embeds 2.12, so + point SPARK_HOME at e.g. spark-3.5.7-bin-hadoop3-scala2.13. 
+ +Run: python3 smoke_test.py +""" + +import glob +import os +import sys +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent + +spark_home = os.environ.get("SPARK_HOME") +if not spark_home or not Path(spark_home, "jars").is_dir(): + sys.exit("Set SPARK_HOME to a Scala 2.13 Spark distribution.") +os.environ["SPARK_HOME"] = spark_home + +jars = glob.glob(str(PROJECT_ROOT / "target" / "__CRATE__-*.jar")) +jars = [j for j in jars if not j.endswith(("-sources.jar", "-javadoc.jar"))] +if not jars: + sys.exit("Shaded jar not found under target/. Run 'mvn package' first.") +jar = jars[0] + +from pyspark.sql import SparkSession # noqa: E402 + +spark = ( + SparkSession.builder.appName("__FORMAT__-smoke") + .master("local[2]") + .config("spark.jars", jar) + # extraClassPath PREPENDS, so the fat jar's Arrow wins over Spark's + # bundled (older) copy on both driver and executors. + .config("spark.driver.extraClassPath", jar) + .config("spark.executor.extraClassPath", jar) + .config("spark.driver.extraJavaOptions", "--add-opens=java.base/java.nio=ALL-UNNAMED") + .config("spark.executor.extraJavaOptions", "--add-opens=java.base/java.nio=ALL-UNNAMED") + .getOrCreate() +) + +df = spark.read.format("__FORMAT__").option("rows", "5").load() +df.printSchema() +df.show(truncate=False) +count = df.count() +filtered = df.filter("id >= 2").count() +spark.stop() + +assert count == 5, f"expected 5 rows, got {count}" +assert filtered == 3, f"expected 3 rows with id >= 2, got {filtered}" +print("smoke test OK: 5 rows scanned, filter returned 3") diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java b/dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java new file mode 100644 index 0000000..7cf02de --- /dev/null +++ b/dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java @@ -0,0 +1,40 @@ +package __PKG__; + +import io.datafusion.spark.NativeLibraryLoader; + +/** + * JNI surface generated on the Rust side by {@code 
export_bridge!} with {@code jni_class = + * "__JNI_CLASS__"} — the mangled binary name of THIS class. Renaming or moving this class + * requires regenerating the Rust macro invocation to match. + * + *

      The cdylib is bundled in this jar under {@code __PKG_PATH__///} (see the antrun + * execution in pom.xml) and extracted/loaded once per JVM by the connector's loader. + */ +final class BridgeNative { + + private BridgeNative() {} + + static { + NativeLibraryLoader.load(BridgeNative.class, "__PKG_PATH__", "__LIB__"); + } + + static native byte[] providerSchemaIpc(byte[] options, byte[] partition); + + static native long createScan( + byte[] options, + byte[] partition, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos); + + static native int partitionCount(long scanHandle); + + static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + + static native void executeStream(long scanHandle, long ffiStreamAddr); + + static native void closeScan(long scanHandle); +} diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java new file mode 100644 index 0000000..c888a0d --- /dev/null +++ b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java @@ -0,0 +1,21 @@ +package __PKG__; + +import io.datafusion.spark.DatafusionSource; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; + +/** + * Gives the bridge its Spark format name: {@code spark.read.format("__FORMAT__")}. Registered via + * {@code META-INF/services/org.apache.spark.sql.sources.DataSourceRegister}. 
+ */ +public class __PREFIX__DataSource extends DatafusionSource { + + @Override + public String shortName() { + return "__FORMAT__"; + } + + @Override + public String factoryFqcn(CaseInsensitiveStringMap options) { + return __PREFIX__ProviderFactory.class.getName(); + } +} diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java new file mode 100644 index 0000000..25b572a --- /dev/null +++ b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java @@ -0,0 +1,28 @@ +package __PKG__; + +import io.datafusion.spark.FfiProviderFactory; +import io.datafusion.spark.ScanBackend; + +/** + * The bridge's contract with the Spark connector. This is a STATIC bridge — the provider is built + * inside this bridge's own cdylib — so the only required override is {@link #scanBackend()}. + * + *

      Useful optional overrides (see their javadoc on {@link FfiProviderFactory}): + * + *

        + *
      • {@code encodeOptions} — only if you have your own options schema; the default ships the + * Spark options map in the connector's {@code OptionsCodec} format, which the Rust side + * already decodes via {@code datafusion_spark_bridge::options::decode_options}. + *
      • {@code listPartitions} — the default is ONE whole-dataset partition. Override to split + * into more Spark tasks (with optional preferred hosts and partition keys), or… + *
      • {@code sharedScan} — …opt into shared-scan mode: one provider per executor, one Spark + * task per DataFusion output partition. Mind the determinism contract. + *
      + */ +public final class __PREFIX__ProviderFactory implements FfiProviderFactory { + + @Override + public ScanBackend scanBackend() { + return new __PREFIX__ScanBackend(); + } +} diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java new file mode 100644 index 0000000..eb78dd1 --- /dev/null +++ b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java @@ -0,0 +1,53 @@ +package __PKG__; + +import io.datafusion.spark.ScanBackend; + +/** Routes the connector's scan calls to this bridge's own cdylib. Pure delegation. */ +public final class __PREFIX__ScanBackend implements ScanBackend { + + @Override + public byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes) { + return BridgeNative.providerSchemaIpc(options, partitionBytes); + } + + @Override + public long createScan( + byte[] options, + byte[] partitionBytes, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos) { + return BridgeNative.createScan( + options, + partitionBytes, + targetPartitions, + batchSize, + optionKeys, + optionValues, + projectionColumns, + filterProtos); + } + + @Override + public int partitionCount(long scanHandle) { + return BridgeNative.partitionCount(scanHandle); + } + + @Override + public void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr) { + BridgeNative.executeStreamPartition(scanHandle, partition, ffiStreamAddr); + } + + @Override + public void executeStream(long scanHandle, long ffiStreamAddr) { + BridgeNative.executeStream(scanHandle, ffiStreamAddr); + } + + @Override + public void closeScan(long scanHandle) { + BridgeNative.closeScan(scanHandle); + } +} diff --git a/dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister 
b/dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister new file mode 100644 index 0000000..e72a178 --- /dev/null +++ b/dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -0,0 +1 @@ +__PKG__.__PREFIX__DataSource diff --git a/dev/new_bridge.py b/dev/new_bridge.py new file mode 100644 index 0000000..6e0b87a --- /dev/null +++ b/dev/new_bridge.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Scaffold a new Spark bridge project from dev/bridge-template/. + +Stamps out a standalone project (Maven + Cargo) wired to the +datafusion-spark-bridge SDK: a Rust cdylib with `export_bridge!` and a demo +in-memory provider, the four Java classes (native surface, ScanBackend, +factory, DataSource shim), the DataSourceRegister service file, a shaded-jar +pom that bundles the cdylib, a pyspark smoke test, and a README with the +build/run commands. 
+ +Usage: + python3 dev/new_bridge.py --name acme --package com.example.acme \ + [--output DIR] [--datafusion-java REPO_ROOT] + +`--name` is the Spark format short name (spark.read.format("acme")); it also +derives the class prefix (acme -> Acme, my_format -> MyFormat), the cargo +crate name, and the cdylib name. Stdlib only; no dependencies. +""" + +import argparse +import re +import sys +from pathlib import Path + +TEMPLATE_DIR = Path(__file__).resolve().parent / "bridge-template" + + +def jni_mangle(binary_class_name: str) -> str: + """JNI symbol mangling for a class's binary name: '_' -> '_1', '.' -> '_'.""" + return binary_class_name.replace("_", "_1").replace(".", "_") + + +def class_prefix(name: str) -> str: + return "".join(part.capitalize() for part in name.split("_")) + + +def validate(name: str, package: str) -> None: + if not re.fullmatch(r"[a-z][a-z0-9_]*", name): + sys.exit(f"--name must match [a-z][a-z0-9_]*, got: {name}") + if not re.fullmatch(r"[a-z][a-z0-9_]*(\.[a-z][a-z0-9_]*)+", package): + sys.exit(f"--package must be a dotted lowercase Java package, got: {package}") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--name", required=True, help="Spark format short name, e.g. acme") + parser.add_argument( + "--package", required=True, help="Java package for the bridge, e.g. 
com.example.acme" + ) + parser.add_argument( + "--output", + help="Directory to create (default: ./-spark-bridge; must not exist)", + ) + parser.add_argument( + "--datafusion-java", + help="datafusion-java repo root providing the spark/bridge SDK crate " + "(default: the repo this script lives in)", + ) + args = parser.parse_args() + + validate(args.name, args.package) + prefix = class_prefix(args.name) + crate = args.name.replace("_", "-") + "-spark-bridge" + lib = args.name + "_spark_bridge" + repo = Path(args.datafusion_java).resolve() if args.datafusion_java else TEMPLATE_DIR.parents[1] + sdk_path = repo / "spark" / "bridge" + if not (sdk_path / "Cargo.toml").is_file(): + sys.exit(f"datafusion-spark-bridge crate not found at {sdk_path}") + out = Path(args.output) if args.output else Path.cwd() / crate + if out.exists(): + sys.exit(f"output directory already exists: {out}") + + tokens = { + "__PKG__": args.package, + "__PKG_PATH__": args.package.replace(".", "/"), + "__JNI_CLASS__": jni_mangle(args.package + ".BridgeNative"), + "__PREFIX__": prefix, + "__FORMAT__": args.name, + "__CRATE__": crate, + "__LIB__": lib, + "__BRIDGE_SDK_PATH__": str(sdk_path), + "__DF_JAVA_VERSION__": read_repo_version(repo), + } + + generated = [] + for src in sorted(TEMPLATE_DIR.rglob("*")): + if not src.is_file(): + continue + rel = str(src.relative_to(TEMPLATE_DIR)) + for token, value in tokens.items(): + rel = rel.replace(token, value) + dst = out / rel + dst.parent.mkdir(parents=True, exist_ok=True) + text = src.read_text() + for token, value in tokens.items(): + text = text.replace(token, value) + dst.write_text(text) + generated.append(rel) + + print(f"Generated {len(generated)} files under {out}:") + for rel in generated: + print(f" {rel}") + print() + print("Next steps (see the generated README.md):") + print(f" 1. cd {out}") + print(" 2. cargo build --release --manifest-path native/Cargo.toml") + print(" 3. mvn package -Dnative.profile=release") + print(f" 4. 
spark.read.format(\"{args.name}\") with the shaded jar on spark.jars") + + +def read_repo_version(repo: Path) -> str: + """datafusion-java's maven version, scraped from the parent pom.""" + pom = (repo / "pom.xml").read_text() + m = re.search(r"([^<]+)", pom) + if not m: + sys.exit(f"could not find in {repo}/pom.xml") + return m.group(1) + + +if __name__ == "__main__": + main() diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java index a34e9c3..333fddf 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java +++ b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java @@ -26,13 +26,12 @@ * {@code MemTable}-backed {@code FFI_TableProvider} that {@link ExampleFfiProviderFactory} hands to * the Spark connector ({@code FfiHelperNative.createScan}). * - *

      The cdylib is bundled inside this jar at {@code - * org/apache/datafusion/examples/<os>/<arch>/} (see the antrun execution in {@code - * examples/pom.xml}) and extracted/loaded once via the connector's {@link NativeLibraryLoader} — - * the same two-piece recipe (pom copy block + one loader call) a real bridge uses to ship its own - * cdylib. For local hacking against an unpackaged build, {@code - * -Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.dylib} bypasses the bundled - * copy. + *

      The cdylib is bundled inside this jar at {@code org/apache/datafusion/examples/<os>/<arch>/} + * (see the antrun execution in {@code examples/pom.xml}) and extracted/loaded once via the + * connector's {@link NativeLibraryLoader} — the same two-piece recipe (pom copy block + one loader + * call) a real bridge uses to ship its own cdylib. For local hacking against an unpackaged build, + * {@code -Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.dylib} bypasses the + * bundled copy. */ final class FfiTableProviderExampleNative { diff --git a/pom.xml b/pom.xml index 8ff131c..8bf1558 100644 --- a/pom.xml +++ b/pom.xml @@ -188,6 +188,9 @@ under the License. **/META-INF/services/** dev/release/rat_exclude_files.txt + + dev/bridge-template/** diff --git a/spark/README.md b/spark/README.md index f8af742..cd3443f 100644 --- a/spark/README.md +++ b/spark/README.md @@ -44,6 +44,20 @@ Everything DataFusion-side (planning, filter application, execution) happens inside the connector's native library. There is no DataFusion session on the JVM side at all. +## Getting started: generate a bridge + +Don't hand-assemble the pieces below — stamp them out: + +```bash +python3 dev/new_bridge.py --name acme --package com.example.acme +``` + +generates a standalone project (Rust cdylib with a working demo provider, +the four Java classes, service registration, shaded-jar pom with the cdylib +bundled, pyspark smoke test, README with the build commands). Replace the +demo `MemTable` in its `native/src/lib.rs` and you have a connector. The +sections below explain what each generated piece is for. 
+ ## What you implement | # | Piece | Language | Contract lives at | Working example | diff --git a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java index 2524c61..fe93fb6 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java +++ b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java @@ -59,8 +59,8 @@ private FfiHelperNative() {} *

      Takes ownership of {@code ffiProviderRawPtr}. {@code targetPartitions} / {@code batchSize} * {@code <= 0} leave the DataFusion defaults; {@code optionKeys}/{@code optionValues} are * parallel arrays of DataFusion config overrides; an empty {@code projectionColumns} selects all - * columns; each element of {@code filterProtos} is a serialized {@code datafusion.LogicalExprNode} - * applied as a filter. + * columns; each element of {@code filterProtos} is a serialized {@code + * datafusion.LogicalExprNode} applied as a filter. * *

      The caller owns the returned handle and must pair it with {@link #closeScan(long)}. Closing * while a stream opened from this handle is still in flight is undefined behaviour — the @@ -82,7 +82,8 @@ public static native long createScan( * Open an independent stream over ONE plan partition, writing an {@code FFI_ArrowArrayStream} * into the caller-allocated struct at {@code ffiStreamAddr}. Concurrent-safe across JVM threads. */ - public static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + public static native void executeStreamPartition( + long scanHandle, int partition, long ffiStreamAddr); /** * Stream the WHOLE plan (all partitions coalesced) into the caller-allocated {@code diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java index 74b00e2..5d64a23 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java @@ -30,9 +30,9 @@ * *

        *
      • Static bridge (preferred when the provider's Rust source is yours): the cdylib is - * built with {@code datafusion_spark_bridge::export_bridge!} and constructs the provider - * from the options/partition bytes natively. Override {@link #scanBackend()} to delegate to - * the JNI class named in the macro; {@link #createProvider(byte[], byte[])} is never called. + * built with {@code datafusion_spark_bridge::export_bridge!} and constructs the provider from + * the options/partition bytes natively. Override {@link #scanBackend()} to delegate to the + * JNI class named in the macro; {@link #createProvider(byte[], byte[])} is never called. *
      • FFI bridge (the provider arrives precompiled, or must stay on a different DataFusion * version): override {@link #createProvider(byte[], byte[])} to return a raw {@code * FFI_TableProvider} pointer; the default {@link #scanBackend()} routes it through the @@ -77,14 +77,12 @@ default byte[] encodeOptions(Map sparkOptions) { * no preference. * *

        Default: one partition ({@code "p0"}, empty payload, no host preference) — one Spark task - * scans the whole dataset. Fine for small tables and first bring-up; override (or opt into - * {@link #sharedScan(byte[])}) before pointing it at anything large. Size guidance lives in - * {@code spark/README.md}. + * scans the whole dataset. Fine for small tables and first bring-up; override (or opt into {@link + * #sharedScan(byte[])}) before pointing it at anything large. Size guidance lives in {@code + * spark/README.md}. */ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { - return new PartitionInfo[] { - new PartitionInfo("p0", new byte[0], new String[0]) - }; + return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])}; } /** @@ -158,15 +156,14 @@ default long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { } /** - * The native scan implementation this bridge talks to. Called wherever the connector needs - * native work — driver-side schema/plan probes and executor-side streams — always on a factory - * freshly instantiated from its class name, so the returned backend never has to be - * serializable. - * - *

        Default: the generic FFI path ({@link FfiScanBackend} over {@link - * #createProvider(byte[], byte[])} and the connector's own cdylib). Static bridges built with - * {@code datafusion_spark_bridge::export_bridge!} override this to return a backend that loads - * their cdylib and delegates each method to the JNI class named in the macro invocation. + * The native scan implementation this bridge talks to. Called wherever the connector needs native + * work — driver-side schema/plan probes and executor-side streams — always on a factory freshly + * instantiated from its class name, so the returned backend never has to be serializable. + * + *

        Default: the generic FFI path ({@link FfiScanBackend} over {@link #createProvider(byte[], + * byte[])} and the connector's own cdylib). Static bridges built with {@code + * datafusion_spark_bridge::export_bridge!} override this to return a backend that loads their + * cdylib and delegates each method to the JNI class named in the macro invocation. */ default ScanBackend scanBackend() { return new FfiScanBackend(this); diff --git a/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java b/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java index f1b6ad2..2da91fb 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java +++ b/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java @@ -20,10 +20,10 @@ package io.datafusion.spark; /** - * Generic FFI {@link ScanBackend}: asks the factory for a raw {@code FFI_TableProvider} pointer - * and routes everything through the connector's own cdylib ({@link FfiHelperNative}). This is the - * {@link FfiProviderFactory#scanBackend()} default; bridges that statically link their provider - * via {@code export_bridge!} replace it with a backend delegating to their own native class. + * Generic FFI {@link ScanBackend}: asks the factory for a raw {@code FFI_TableProvider} pointer and + * routes everything through the connector's own cdylib ({@link FfiHelperNative}). This is the + * {@link FfiProviderFactory#scanBackend()} default; bridges that statically link their provider via + * {@code export_bridge!} replace it with a backend delegating to their own native class. 
*/ public final class FfiScanBackend implements ScanBackend { @@ -51,7 +51,12 @@ public long createScan( byte[][] filterProtos) { long ptr = factory.createProvider(options, partitionBytes); return FfiHelperNative.createScan( - ptr, targetPartitions, batchSize, optionKeys, optionValues, projectionColumns, + ptr, + targetPartitions, + batchSize, + optionKeys, + optionValues, + projectionColumns, filterProtos); } diff --git a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java index 9f330d0..6c7ecd5 100644 --- a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java +++ b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java @@ -42,8 +42,8 @@ *

        The connector loads its own cdylib through this class (prefix {@code io/datafusion/spark}); * bridges are encouraged to reuse it via {@link #load(Class, String, String)} from their native * class's static initializer, with their own resource prefix, instead of hand-rolling extraction. - * Bundle the cdylib with the same antrun-copy pattern the connector's pom uses (see "Packaging - * your bridge" in {@code spark/README.md}). + * Bundle the cdylib with the same antrun-copy pattern the connector's pom uses (see "Packaging your + * bridge" in {@code spark/README.md}). */ public final class NativeLibraryLoader { @@ -59,11 +59,11 @@ static void loadLibrary(String name) { /** * Extract {@code ///} from {@code anchor}'s classloader - * and {@link System#load} it. Idempotent per (prefix, name): repeated calls — e.g. one per - * Spark task instantiating the bridge's native class — load once. + * and {@link System#load} it. Idempotent per (prefix, name): repeated calls — e.g. one per Spark + * task instantiating the bridge's native class — load once. * - * @param anchor class whose classloader holds the resource (the bridge's own native class, so - * the lookup works under Spark's per-application classloaders) + * @param anchor class whose classloader holds the resource (the bridge's own native class, so the + * lookup works under Spark's per-application classloaders) * @param resourcePrefix jar-internal directory, no leading or trailing slash (e.g. {@code * "com/example/mybridge"}) * @param name unmapped library name (e.g. {@code "my_bridge"} for {@code libmy_bridge.so}) diff --git a/spark/src/main/java/io/datafusion/spark/OptionsCodec.java b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java index 092ec2a..9b1feca 100644 --- a/spark/src/main/java/io/datafusion/spark/OptionsCodec.java +++ b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java @@ -32,8 +32,8 @@ * *

        Layout (all integers big-endian {@code int32}): entry count, then per entry key length, key * bytes, value length, value bytes. Key-sorting makes the bytes a pure function of the map's - * contents regardless of source iteration order — required by the shared-scan determinism - * contract, where the options bytes are the cache/plan identity. + * contents regardless of source iteration order — required by the shared-scan determinism contract, + * where the options bytes are the cache/plan identity. * *

        The Rust decoder lives in {@code datafusion_spark_bridge::options}; bridges using the default * {@code encodeOptions} read their options there as a {@code BTreeMap}. The two @@ -45,8 +45,7 @@ private OptionsCodec() {} /** Encode {@code options} sorted by key. {@code null} or empty map encodes as count 0. */ public static byte[] encode(Map options) { - TreeMap sorted = - options == null ? new TreeMap<>() : new TreeMap<>(options); + TreeMap sorted = options == null ? new TreeMap<>() : new TreeMap<>(options); ByteArrayOutputStream out = new ByteArrayOutputStream(); writeInt(out, sorted.size()); for (Map.Entry e : sorted.entrySet()) { diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java index 522d4e2..1dea72e 100644 --- a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -28,8 +28,8 @@ *

        Fields: * *

          - *
        • {@code id} — stable, human-readable identifier for this partition (e.g. a segment - * id). Surfaces in Spark UI, logs, and exception messages. Must be non-empty. + *
        • {@code id} — stable, human-readable identifier for this partition (e.g. a segment id). + * Surfaces in Spark UI, logs, and exception messages. Must be non-empty. *
        • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the executor * needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). Combined with * the global {@code optionsProtoBytes} in {@link FfiProviderFactory#createProvider(byte[], diff --git a/spark/src/main/java/io/datafusion/spark/ScanBackend.java b/spark/src/main/java/io/datafusion/spark/ScanBackend.java index 4bc9bc4..21b9873 100644 --- a/spark/src/main/java/io/datafusion/spark/ScanBackend.java +++ b/spark/src/main/java/io/datafusion/spark/ScanBackend.java @@ -20,9 +20,9 @@ package io.datafusion.spark; /** - * Native scan surface the connector plumbing talks to. One method per JNI entry point of the - * {@code datafusion-spark-bridge} scan machinery; implementations only differ in which - * native library and class the calls land on: + * Native scan surface the connector plumbing talks to. One method per JNI entry point of the {@code + * datafusion-spark-bridge} scan machinery; implementations only differ in which native + * library and class the calls land on: * *
            *
          • {@link FfiScanBackend} (the {@link FfiProviderFactory#scanBackend()} default) builds the @@ -30,8 +30,8 @@ * the connector's own cdylib ({@link FfiHelperNative}) — the generic FFI path. *
          • A static bridge supplies its own implementation delegating to the class it named in its * {@code export_bridge!} invocation, whose generated {@code createScan} builds the provider - * from {@code options}/{@code partitionBytes} directly — no pointer handover, no - * {@code datafusion-ffi}. + * from {@code options}/{@code partitionBytes} directly — no pointer handover, no {@code + * datafusion-ffi}. *
          * *

          Implementations must be stateless or thread-safe: the driver probes schemas and plans through @@ -42,8 +42,8 @@ public interface ScanBackend { /** - * Driver-side schema probe: the widened Arrow schema of the provider described by {@code - * options} + {@code partitionBytes}, serialized as Arrow IPC bytes (deserialize with {@code + * Driver-side schema probe: the widened Arrow schema of the provider described by {@code options} + * + {@code partitionBytes}, serialized as Arrow IPC bytes (deserialize with {@code * MessageSerializer.deserializeSchema}). */ byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes); @@ -51,8 +51,8 @@ public interface ScanBackend { /** * Build a planned scan and return its handle. {@code targetPartitions}/{@code batchSize} {@code * <= 0} leave DataFusion defaults; {@code optionKeys}/{@code optionValues} are parallel config - * override arrays; empty {@code projectionColumns} selects all columns; each {@code - * filterProtos} element is a serialized {@code datafusion.LogicalExprNode}. + * override arrays; empty {@code projectionColumns} selects all columns; each {@code filterProtos} + * element is a serialized {@code datafusion.LogicalExprNode}. * *

          The caller owns the handle and must pair it with {@link #closeScan(long)}. Closing while a * stream opened from the handle is in flight is undefined behaviour — the shared-scan cache's From 9e47f0e192e2ad345124bd859025820742df36f0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 17:55:11 +0200 Subject: [PATCH 17/22] refactor(spark)!: remove the FFI provider path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every known bridge owns its provider's Rust source, so the FFI_TableProvider handover was speculative generality with real costs: a second cdylib bundled in the connector jar, datafusion-ffi ABI lockstep between artifacts, and pointer-ownership rules across JNI. Static export_bridge! bridges are now the only path. - delete the datafusion-spark-helper cdylib, FfiHelperNative, FfiScanBackend, and the bridge SDK's ffi module + feature; datafusion-ffi leaves the dependency tree entirely - the connector jar goes pure JVM: no cargo prerequisite, no per-platform builds; native code ships inside each bridge's jar - convert examples/native to an export_bridge! bridge (datafusion-java-example-bridge) — the committed, runnable concrete-provider example; demo renamed to bridge_demo.py BREAKING CHANGE: FfiProviderFactory is renamed BridgeProviderFactory; createProvider is gone and scanBackend() is the single required method. Re-adding an FFI path later is mechanical: the ScanBackend seam and scan.rs's provider-source closure are unchanged, and the deleted code sits intact in this branch's history. Verified: cargo + mvn suites, spotless/RAT verify, pyspark demo (both scan modes), and a regenerated scaffold built + smoke-tested. 
Co-Authored-By: Claude Fable 5 --- Cargo.lock | 306 +++--------------- Cargo.toml | 2 - dev/bridge-template/native/Cargo.toml | 4 +- .../__PREFIX__ProviderFactory.java | 10 +- docs/source/contributor-guide/development.md | 9 +- examples/README.md | 19 +- examples/native/Cargo.toml | 12 +- examples/native/src/lib.rs | 133 ++------ examples/pom.xml | 16 +- examples/python/README.md | 26 +- ..._table_provider_demo.py => bridge_demo.py} | 20 +- .../examples/ExampleBridgeNative.java | 68 ++++ ...java => ExampleBridgeProviderFactory.java} | 35 +- .../examples/ExampleScanBackend.java | 34 +- .../FfiTableProviderExampleNative.java | 69 ---- spark/README.md | 183 ++++------- spark/bridge/Cargo.toml | 8 - spark/bridge/src/ffi.rs | 43 --- spark/bridge/src/lib.rs | 24 +- spark/bridge/src/options.rs | 2 +- spark/native/Cargo.toml | 25 -- spark/native/src/lib.rs | 104 ------ spark/pom.xml | 116 +------ ...actory.java => BridgeProviderFactory.java} | 86 ++--- .../io/datafusion/spark/FfiHelperNative.java | 97 ------ .../datafusion/spark/NativeLibraryLoader.java | 13 +- .../io/datafusion/spark/OptionsCodec.java | 4 +- .../io/datafusion/spark/PartitionInfo.java | 8 +- .../spark/ReportedPartitioning.java | 4 +- .../java/io/datafusion/spark/ScanBackend.java | 17 +- .../io/datafusion/spark/DatafusionBatch.scala | 2 +- .../DatafusionColumnarPartitionReader.scala | 6 +- .../spark/DatafusionInputPartition.scala | 4 +- .../io/datafusion/spark/DatafusionScan.scala | 4 +- .../spark/DatafusionScanBuilder.scala | 8 +- .../datafusion/spark/DatafusionSource.scala | 6 +- .../spark/NativeSharedScanResources.scala | 2 +- .../spark/PinnedSessionConfig.scala | 2 +- .../spark/SharedScanPartitionReader.scala | 2 +- ...> BridgeProviderFactoryDefaultsTest.scala} | 44 ++- 40 files changed, 383 insertions(+), 1194 deletions(-) rename examples/python/{ffi_table_provider_demo.py => bridge_demo.py} (92%) create mode 100644 examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java rename 
examples/src/main/java/org/apache/datafusion/examples/{ExampleFfiProviderFactory.java => ExampleBridgeProviderFactory.java} (81%) rename spark/src/main/java/io/datafusion/spark/FfiScanBackend.java => examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java (59%) delete mode 100644 examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java delete mode 100644 spark/bridge/src/ffi.rs delete mode 100644 spark/native/Cargo.toml delete mode 100644 spark/native/src/lib.rs rename spark/src/main/java/io/datafusion/spark/{FfiProviderFactory.java => BridgeProviderFactory.java} (64%) delete mode 100644 spark/src/main/java/io/datafusion/spark/FfiHelperNative.java rename spark/src/test/scala/io/datafusion/spark/{FfiProviderFactoryDefaultsTest.scala => BridgeProviderFactoryDefaultsTest.scala} (68%) diff --git a/Cargo.lock b/Cargo.lock index ce9e1a0..286f96f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,54 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "abi_stable" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" -dependencies = [ - "abi_stable_derive", - "abi_stable_shared", - "const_panic", - "core_extensions", - "crossbeam-channel", - "generational-arena", - "libloading", - "lock_api", - "parking_lot", - "paste", - "repr_offset", - "rustc_version", - "serde", - "serde_derive", - "serde_json", -] - -[[package]] -name = "abi_stable_derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" -dependencies = [ - "abi_stable_shared", - "as_derive_utils", - "core_extensions", - "proc-macro2", - "quote", - "rustc_version", - "syn 1.0.109", - "typed-arena", -] - -[[package]] -name = "abi_stable_shared" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" -dependencies = [ - "core_extensions", -] - [[package]] name = "adler2" version = "2.0.1" @@ -390,18 +342,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "as_derive_utils" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" -dependencies = [ - "core_extensions", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "async-compression" version = "0.4.42" @@ -414,15 +354,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "async-ffi" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" -dependencies = [ - "abi_stable", -] - [[package]] name = "async-recursion" version = "1.1.1" @@ -431,7 +362,7 @@ checksum = 
"3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -442,7 +373,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -546,7 +477,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn", ] [[package]] @@ -730,15 +661,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "const_panic" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" -dependencies = [ - "typewit", -] - [[package]] name = "constant_time_eq" version = "0.4.2" @@ -761,21 +683,6 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core_extensions" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" -dependencies = [ - "core_extensions_proc_macros", -] - -[[package]] -name = "core_extensions_proc_macros" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" - [[package]] name = "cpufeatures" version = "0.2.17" @@ -803,15 +710,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -875,7 +773,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn", ] [[package]] @@ -886,7 +784,7 @@ checksum = 
"ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1265,36 +1163,6 @@ dependencies = [ "paste", ] -[[package]] -name = "datafusion-ffi" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95173344d04ba62755c949bf44f8d1a6e4414cf6392a635db96c07e711b9a3c" -dependencies = [ - "abi_stable", - "arrow", - "arrow-schema", - "async-ffi", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-proto", - "datafusion-proto-common", - "datafusion-session", - "futures", - "log", - "prost", - "semver", - "tokio", -] - [[package]] name = "datafusion-functions" version = "53.1.0" @@ -1432,13 +1300,12 @@ dependencies = [ ] [[package]] -name = "datafusion-java-ffi-example" +name = "datafusion-java-example-bridge" version = "0.1.0" dependencies = [ "arrow", "datafusion", - "datafusion-ffi", - "jni", + "datafusion-spark-bridge", "tokio", ] @@ -1481,7 +1348,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1688,7 +1555,6 @@ dependencies = [ "arrow", "async-trait", "datafusion", - "datafusion-ffi", "datafusion-jni-common", "datafusion-proto", "futures", @@ -1697,14 +1563,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "datafusion-spark-helper" -version = "0.1.0" -dependencies = [ - "datafusion-spark-bridge", - "jni", -] - [[package]] name = "datafusion-sql" version = "53.1.0" @@ -1763,7 +1621,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1916,7 +1774,7 @@ checksum 
= "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -1948,15 +1806,6 @@ dependencies = [ "slab", ] -[[package]] -name = "generational-arena" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" -dependencies = [ - "cfg-if", -] - [[package]] name = "generic-array" version = "0.14.7" @@ -2413,7 +2262,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -2512,16 +2361,6 @@ version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" -[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - [[package]] name = "liblzma" version = "0.4.6" @@ -2916,7 +2755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn", ] [[package]] @@ -2953,7 +2792,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn", "tempfile", ] @@ -2967,7 +2806,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3217,7 +3056,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3274,15 +3113,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "repr_offset" -version = 
"0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" -dependencies = [ - "tstr", -] - [[package]] name = "reqwest" version = "0.12.28" @@ -3465,7 +3295,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn", ] [[package]] @@ -3550,7 +3380,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3561,7 +3391,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3586,7 +3416,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn", ] [[package]] @@ -3696,7 +3526,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3739,7 +3569,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3763,7 +3593,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn", "typify", "walkdir", ] @@ -3774,17 +3604,6 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - [[package]] name = "syn" version = "2.0.117" @@ -3813,7 +3632,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3855,7 +3674,7 @@ checksum = 
"4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3866,7 +3685,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -3937,7 +3756,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4051,7 +3870,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4069,45 +3888,18 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "tstr" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" -dependencies = [ - "tstr_proc_macros", -] - -[[package]] -name = "tstr_proc_macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" - [[package]] name = "twox-hash" version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - [[package]] name = "typenum" version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" -[[package]] -name = "typewit" -version = "1.15.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" - [[package]] name = "typify" version = "0.5.0" @@ -4133,7 +3925,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn", "thiserror 2.0.18", "unicode-ident", ] @@ -4151,7 +3943,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn", "typify-impl", ] @@ -4312,7 +4104,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn", "wasm-bindgen-shared", ] @@ -4392,22 +4184,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" version = "0.1.11" @@ -4417,12 +4193,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-core" version = "0.62.2" @@ -4444,7 +4214,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4455,7 +4225,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4740,7 +4510,7 @@ dependencies = [ "heck", "indexmap", "prettyplease", - "syn 2.0.117", + "syn", "wasm-metadata", 
"wit-bindgen-core", "wit-component", @@ -4756,7 +4526,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -4823,7 +4593,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] @@ -4844,7 +4614,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] @@ -4864,7 +4634,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", "synstructure", ] @@ -4904,7 +4674,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 0597e55..be906aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,6 @@ members = [ "native-common", "examples/native", "spark/bridge", - "spark/native", ] # Every dependency used by any workspace member is declared here so version @@ -34,7 +33,6 @@ members = [ arrow = { version = "58", features = ["ffi"] } async-trait = "0.1" datafusion = { version = "53.1.0" } -datafusion-ffi = "53.1.0" datafusion-proto = "53.1.0" datafusion-substrait = "53.1.0" futures = "0.3" diff --git a/dev/bridge-template/native/Cargo.toml b/dev/bridge-template/native/Cargo.toml index b87a927..c0d2996 100644 --- a/dev/bridge-template/native/Cargo.toml +++ b/dev/bridge-template/native/Cargo.toml @@ -13,11 +13,9 @@ name = "__LIB__" crate-type = ["cdylib"] [dependencies] -# default-features = false drops the datafusion-ffi import path — a static -# bridge never crosses an FFI_TableProvider boundary. # TODO: replace the path with a git or crates.io dependency once you build # outside a local datafusion-java checkout. 
-datafusion-spark-bridge = { path = "__BRIDGE_SDK_PATH__", default-features = false } +datafusion-spark-bridge = { path = "__BRIDGE_SDK_PATH__" } [profile.release] strip = "debuginfo" diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java index 25b572a..03498e4 100644 --- a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java +++ b/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java @@ -1,13 +1,13 @@ package __PKG__; -import io.datafusion.spark.FfiProviderFactory; +import io.datafusion.spark.BridgeProviderFactory; import io.datafusion.spark.ScanBackend; /** - * The bridge's contract with the Spark connector. This is a STATIC bridge — the provider is built - * inside this bridge's own cdylib — so the only required override is {@link #scanBackend()}. + * The bridge's contract with the Spark connector: the provider is built inside this bridge's own + * cdylib, and {@link #scanBackend()} is the only required method. * - *

          Useful optional overrides (see their javadoc on {@link FfiProviderFactory}): + *

          Useful optional overrides (see their javadoc on {@link BridgeProviderFactory}): * *

            *
          • {@code encodeOptions} — only if you have your own options schema; the default ships the @@ -19,7 +19,7 @@ * task per DataFusion output partition. Mind the determinism contract. *
          */ -public final class __PREFIX__ProviderFactory implements FfiProviderFactory { +public final class __PREFIX__ProviderFactory implements BridgeProviderFactory { @Override public ScanBackend scanBackend() { diff --git a/docs/source/contributor-guide/development.md b/docs/source/contributor-guide/development.md index cb80276..fdb00f4 100644 --- a/docs/source/contributor-guide/development.md +++ b/docs/source/contributor-guide/development.md @@ -75,7 +75,7 @@ disk space. The repository is a multi-module Maven build: - `Cargo.toml` — Rust workspace root declaring the three crate members - (`native`, `examples/native`, `spark/native`) and `[workspace.dependencies]` + (`native`, `native-common`, `examples/native`, `spark/bridge`) and `[workspace.dependencies]` that pin shared versions in one place. Cargo writes artifacts to `rust-target/` (overridden in `.cargo/config.toml`) so `mvn clean` at the repo root does not nuke the Rust build cache. @@ -84,12 +84,13 @@ The repository is a multi-module Maven build: - `core/` — `datafusion-java` library module (Java sources, tests, and generated protobuf classes). - `spark/` — `datafusion-java-spark` Spark DataSource V2 connector - (Scala + Java) and its `spark/native/` widening cdylib crate. + (Scala + Java, pure JVM) and its `spark/bridge/` Rust SDK crate + (`datafusion-spark-bridge`: widening, scan machinery, `export_bridge!`). - `examples/` — `datafusion-java-examples` module containing runnable examples that depend on the library; built alongside the library so they cannot fall out of sync with the API. Includes `examples/native/`, a - small FFI table-provider cdylib used by the Spark connector demo - (`ExampleFfiProviderFactory` + the pyspark script under + small `export_bridge!` cdylib used by the Spark connector demo + (`ExampleBridgeProviderFactory` + the pyspark script under `examples/python/`). - `native/` — `datafusion-jni` Rust crate (JNI + Arrow C Data Interface). 
- `proto/` — Protobuf definitions shared between Java and Rust. diff --git a/examples/README.md b/examples/README.md index 6876810..da9fec7 100644 --- a/examples/README.md +++ b/examples/README.md @@ -64,17 +64,18 @@ the result rows. Swap `SqlQueryExample` for any class in the table below. ## The Spark connector example One example is not a standalone `main`: -`ExampleFfiProviderFactory` implements the Spark connector's -`FfiProviderFactory` interface over a tiny Rust-built in-memory table (the -cdylib under [`native/`](native/)). It exists to be loaded *by Spark* — the -runnable end-to-end version is the PySpark demo under -[`python/`](python/), and the guide to building your own connector is +`ExampleBridgeProviderFactory` implements the Spark connector's +`BridgeProviderFactory` interface over a tiny in-memory table built inside +the example bridge cdylib (the `export_bridge!` crate under +[`native/`](native/)). It exists to be loaded *by Spark* — the runnable +end-to-end version is the PySpark demo under [`python/`](python/), and the +guide to building your own connector is [`../spark/README.md`](../spark/README.md). To build its cdylib (workspace member, buildable from anywhere in the tree): ```bash -cargo build -p datafusion-java-ffi-example --release +cargo build -p datafusion-java-example-bridge --release ``` Building the examples jar then bundles the cdylib inside it (under @@ -83,7 +84,7 @@ there at runtime via the connector's `NativeLibraryLoader` — the same packaging recipe a real bridge uses (see "Packaging your bridge" in [`../spark/README.md`](../spark/README.md)). To run against an unpackaged local build instead, pass -`-Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.{so,dylib}`. +`-Dexample.bridge.lib.path=/abs/path/to/libdatafusion_example_bridge.{so,dylib}`. ## Troubleshooting @@ -94,5 +95,5 @@ local build instead, pass built in a different profile than Maven expects. 
Re-run build step 1 and keep `-Ddatafusion.native.profile=release` consistent between the cargo profile (`--release`) and the Maven flag. -- **`UnsatisfiedLinkError ... datafusion_java_ffi_example`** — only the FFI - example's cdylib is missing; see "The Spark connector example" above. +- **`UnsatisfiedLinkError ... datafusion_example_bridge`** — only the example + bridge cdylib is missing; see "The Spark connector example" above. diff --git a/examples/native/Cargo.toml b/examples/native/Cargo.toml index 2d51ac2..1e362cc 100644 --- a/examples/native/Cargo.toml +++ b/examples/native/Cargo.toml @@ -9,19 +9,21 @@ # http://www.apache.org/licenses/LICENSE-2.0 [package] -name = "datafusion-java-ffi-example" +name = "datafusion-java-example-bridge" version = "0.1.0" edition = "2021" publish = false [lib] -# Built as a cdylib so the JVM-side example can System.load() the artifact. -# `rlib` lets us add Rust-level unit tests if needed. +name = "datafusion_example_bridge" +# Built as a cdylib so the JVM loads it via NativeLibraryLoader; `rlib` keeps +# the Rust-level unit tests (options decoding, partition layout) runnable. crate-type = ["cdylib", "rlib"] [dependencies] arrow = { workspace = true } datafusion = { workspace = true } -datafusion-ffi = { workspace = true } -jni = { workspace = true } +datafusion-spark-bridge = { path = "../../spark/bridge" } + +[dev-dependencies] tokio = { workspace = true } diff --git a/examples/native/src/lib.rs b/examples/native/src/lib.rs index 756618e..b0b17e8 100644 --- a/examples/native/src/lib.rs +++ b/examples/native/src/lib.rs @@ -15,20 +15,19 @@ // specific language governing permissions and limitations // under the License. -//! Example cdylib that produces a small DataFusion `MemTable` wrapped as an -//! `FFI_TableProvider`, returned to the JVM as a `jlong` (the raw boxed -//! pointer). The Spark connector consumes the pointer via -//! `FfiHelperNative.createScan` / `providerSchemaIpc`, which widen the -//! 
provider and plan/execute the scan inside the connector cdylib. +//! Example bridge cdylib: a small DataFusion `MemTable` exposed to Spark +//! through the `datafusion-spark-bridge` SDK. `export_bridge!` generates the +//! whole JNI surface for `org.apache.datafusion.examples.ExampleBridgeNative`; +//! this crate only decodes the options blob and builds the provider. //! -//! The same pattern is what domain bridges (HDF5, custom Iceberg, in-house formats) use -//! to expose their TableProviders to Spark via the connector-core DataSource -//! V2 plumbing. +//! The same pattern is what domain bridges (HDF5, custom Iceberg, in-house +//! formats) use to expose their TableProviders to Spark via the connector's +//! DataSource V2 plumbing. //! //! ## Options wire format //! -//! `createMemTableProvider` accepts an opaque `byte[]` that the JVM-side -//! `ExampleFfiProviderFactory.encodeOptions` produces. Layout (little-endian): +//! The provider builder accepts an opaque `byte[]` that the JVM-side +//! `ExampleBridgeProviderFactory.encodeOptions` produces. Layout (little-endian): //! //! ```text //! [u32 name_prefix_len][name_prefix UTF-8 bytes][u32 num_rows][u32 num_batches] @@ -38,10 +37,11 @@ //! Empty/`null` bytes decode as all defaults: `name_prefix="row"`, `num_rows=4`, //! `num_batches=1`, `num_partitions=1`, `shared_scan=false`. The trailing //! fields are optional so blobs from older encoders keep decoding. The -//! `shared_scan` flag is consumed JVM-side (`ExampleFfiProviderFactory.sharedScan`); +//! `shared_scan` flag is consumed JVM-side (`ExampleBridgeProviderFactory.sharedScan`); //! this decoder carries it only so one blob format serves both sides. Real -//! bridges use a real proto schema here; this example hand-rolls the encoding -//! to keep the wire layer obvious. +//! bridges can use the connector's default `OptionsCodec` instead (decoded via +//! `datafusion_spark_bridge::options`); this example hand-rolls the encoding +//! 
to show a custom wire layer. use std::sync::Arc; @@ -49,34 +49,7 @@ use arrow::array::{Float64Array, Int64Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use datafusion::catalog::TableProvider; use datafusion::datasource::MemTable; -use datafusion::execution::TaskContextProvider; -use datafusion::prelude::SessionContext; -use datafusion_ffi::execution::FFI_TaskContextProvider; -use datafusion_ffi::table_provider::FFI_TableProvider; -use jni::objects::{JByteArray, JClass}; -use jni::sys::jlong; -use jni::JNIEnv; -use tokio::runtime::{Handle, Runtime}; - -/// Tokio runtime that the FFI provider is anchored to. Shared across calls -/// for the lifetime of the cdylib so successive `createMemTableProvider` -/// invocations don't spawn fresh runtimes. -fn runtime() -> &'static Handle { - use std::sync::OnceLock; - static RT: OnceLock = OnceLock::new(); - RT.get_or_init(|| Runtime::new().expect("tokio runtime init failed")) - .handle() -} - -/// Host `SessionContext` used only to obtain a `TaskContextProvider` for -/// `FFI_TableProvider::new`. Static on purpose: the `FFI_TaskContextProvider` -/// holds a non-owning reference, so this context must outlive every provider -/// built from it. Nothing is ever registered on it. -fn host_session_context() -> &'static Arc { - use std::sync::OnceLock; - static CTX: OnceLock> = OnceLock::new(); - CTX.get_or_init(|| Arc::new(SessionContext::new())) -} +use datafusion_spark_bridge::{export_bridge, BridgeContext, JniResult}; #[derive(Debug)] struct Options { @@ -186,70 +159,22 @@ fn build_mem_table( Ok(Arc::new(MemTable::try_new(schema, partitions)?)) } -/// JNI entry point: decode the options blob, build a `MemTable` accordingly, -/// wrap it in an `FFI_TableProvider`, return the raw boxed pointer as a `jlong`. 
-/// Ownership of the boxed FFI transfers to the caller — the matching -/// `Box::from_raw` is performed by the consumer (the Spark connector's -/// `FfiHelperNative.createScan` / `providerSchemaIpc`). -#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_createMemTableProvider< - 'local, ->( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - options_bytes: JByteArray<'local>, -) -> jlong { - let result: Result> = (|| { - let bytes: Vec = if options_bytes.is_null() { - Vec::new() - } else { - env.convert_byte_array(&options_bytes) - .map_err(|e| format!("failed to read options byte[] from JVM: {e}"))? - }; - let opts = decode_options(&bytes)?; - - let mem_table = build_mem_table(&opts)?; - let provider: Arc = mem_table; - - let ctx_provider: Arc = - Arc::clone(host_session_context()) as Arc; - let ffi_task_ctx = FFI_TaskContextProvider::from(&ctx_provider); - let ffi = FFI_TableProvider::new( - provider, - /*can_support_pushdown_filters=*/ true, - Some(runtime().clone()), - ffi_task_ctx, - /*logical_codec=*/ None, - ); - Ok(Box::into_raw(Box::new(ffi)) as jlong) - })(); - - match result { - Ok(ptr) => ptr, - Err(err) => { - let _ = env.throw_new("java/lang/RuntimeException", err.to_string()); - 0 - } - } +/// Build the example provider for one scan: decode the options blob, build +/// the `MemTable` accordingly. `partition` is unused — the example reports a +/// single partition (or relies on shared-scan mode), so there is no per-task +/// payload to interpret. +fn build_provider( + _ctx: &BridgeContext, + options: &[u8], + _partition: &[u8], +) -> JniResult> { + let opts = decode_options(options)?; + Ok(build_mem_table(&opts)?) } -/// Drop a previously-created FFI_TableProvider whose pointer was NOT handed -/// off to a consumer. Exposed for the error path — callers that pass the -/// pointer to `createScan` / `providerSchemaIpc` must NOT also call this; -/// ownership has already transferred. 
-#[no_mangle] -pub extern "system" fn Java_org_apache_datafusion_examples_FfiTableProviderExampleNative_dropProvider< - 'local, ->( - _env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_ptr: jlong, -) { - if ffi_ptr != 0 { - unsafe { - drop(Box::from_raw(ffi_ptr as *mut FFI_TableProvider)); - } - } +export_bridge! { + jni_class: "org_apache_datafusion_examples_ExampleBridgeNative", + build_provider: build_provider, } #[cfg(test)] @@ -316,6 +241,8 @@ mod tests { let table = build_mem_table(&opts).unwrap(); // MemTable has no partition accessor; verify via scan output partitioning. use datafusion::catalog::TableProvider; + use datafusion::prelude::SessionContext; + use tokio::runtime::Runtime; let ctx = SessionContext::new(); let rt = Runtime::new().unwrap(); let plan = rt diff --git a/examples/pom.xml b/examples/pom.xml index 1f156c6..02888c3 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -47,7 +47,7 @@ under the License. org.apache.datafusion datafusion-java - @@ -105,14 +105,14 @@ under the License. 3.1.0 - copy-ffi-example-cdylib + copy-example-bridge-cdylib process-classes run - + @@ -139,7 +139,7 @@ under the License. linux x86_64 - libdatafusion_java_ffi_example.so + libdatafusion_example_bridge.so @@ -150,7 +150,7 @@ under the License. linux x86_64 - libdatafusion_java_ffi_example.so + libdatafusion_example_bridge.so @@ -161,7 +161,7 @@ under the License. linux aarch64 - libdatafusion_java_ffi_example.so + libdatafusion_example_bridge.so @@ -172,7 +172,7 @@ under the License. darwin x86_64 - libdatafusion_java_ffi_example.dylib + libdatafusion_example_bridge.dylib @@ -183,7 +183,7 @@ under the License. 
darwin aarch64 - libdatafusion_java_ffi_example.dylib + libdatafusion_example_bridge.dylib diff --git a/examples/python/README.md b/examples/python/README.md index edf7f3f..b272d64 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -1,30 +1,28 @@ # PySpark end-to-end demo -`ffi_table_provider_demo.py` proves the full DataFusion → Spark path: +`bridge_demo.py` proves the full DataFusion → Spark path: ``` -examples/native (cdylib) <-- in-memory MemTable - | jlong (FFI_TableProvider*) - v -ExampleFfiProviderFactory <-- implements FfiProviderFactory +examples/native (export_bridge! cdylib) <-- in-memory MemTable + scan machinery + ^ byte[] options / FFI_ArrowArrayStream + | +ExampleBridgeProviderFactory <-- implements BridgeProviderFactory | Class.forName(...) v -datafusion-java-spark <-- DSv2 plumbing, widening, predicate xlate +datafusion-java-spark <-- DSv2 plumbing, predicate xlate | spark.read.format("datafusion") v -PySpark DataFrame <-- printSchema / show / filter / select +PySpark DataFrame <-- printSchema / show / filter / select ``` ## Prerequisites 1. **Java 17.** `JAVA_HOME` must point at a JDK 17 install. -2. **Three cdylibs** built from this repo: +2. **The example bridge cdylib** built from this repo: ```bash - cd native && cargo build --release && cd .. - cd examples/native && cargo build --release && cd ../.. - cd spark/native && cargo build --release && cd ../.. + cargo build -p datafusion-java-example-bridge --release ``` 3. **Maven artifacts installed into a side-loaded local repository.** @@ -71,7 +69,7 @@ PySpark DataFrame <-- printSchema / show / filter / select ## Run ```bash -examples/python/.venv/bin/python examples/python/ffi_table_provider_demo.py +examples/python/.venv/bin/python examples/python/bridge_demo.py ``` Expected output: @@ -129,6 +127,6 @@ Arrow batches cross back to Spark. 
- The `datafusion` format short name resolves via the SPI file in `spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister`. You can also use the FQCN: `format("io.datafusion.spark.DatafusionSource")`. -- To swap in your own bridge, write a `FfiProviderFactory` against your own - cdylib (mirroring `ExampleFfiProviderFactory`) and pass its FQCN via +- To swap in your own bridge, write a `BridgeProviderFactory` against your own + cdylib (mirroring `ExampleBridgeProviderFactory`) and pass its FQCN via `option("df.factory", ...)`. diff --git a/examples/python/ffi_table_provider_demo.py b/examples/python/bridge_demo.py similarity index 92% rename from examples/python/ffi_table_provider_demo.py rename to examples/python/bridge_demo.py index 1cff37b..bf3f6e6 100644 --- a/examples/python/ffi_table_provider_demo.py +++ b/examples/python/bridge_demo.py @@ -16,21 +16,19 @@ # specific language governing permissions and limitations # under the License. # -"""End-to-end PySpark demo of the DataFusion FFI table provider. +"""End-to-end PySpark demo of a DataFusion table provider exposed as a Spark data source. Wires the in-memory example MemTable produced by ``examples/native`` into a Spark DataSource V2 scan through the generic connector in ``spark/``. Prerequisites (run from the repo root): - cd native && cargo build --release && cd .. - cd examples/native && cargo build --release && cd ../.. - cd spark/native && cargo build --release && cd ../.. 
+ cargo build --release --workspace mvn install -Ddatafusion.native.profile=release -DskipTests Run: - python3 examples/python/ffi_table_provider_demo.py + python3 examples/python/bridge_demo.py """ import glob @@ -125,7 +123,7 @@ def main() -> None: extra_classpath = ":".join(app_jars) spark = ( - SparkSession.builder.appName("datafusion-ffi-demo") + SparkSession.builder.appName("datafusion-bridge-demo") .master("local[2]") .config("spark.jars", jars) .config("spark.driver.extraClassPath", extra_classpath) @@ -143,11 +141,11 @@ def main() -> None: # The example cdylib is bundled inside the examples jar and extracted by # NativeLibraryLoader at first use; no working-directory or path setup is - # needed. (-Dexample.ffi.lib.path via extraJavaOptions overrides it for + # needed. (-Dexample.bridge.lib.path via extraJavaOptions overrides it for # unpackaged local builds.) # `name_prefix`, `num_rows`, `num_batches` are interpreted by - # ExampleFfiProviderFactory.encodeOptions and decoded on the Rust side + # ExampleBridgeProviderFactory.encodeOptions and decoded on the Rust side # in examples/native/src/lib.rs. They demonstrate driver-side options # flowing through to the native MemTable build. name_prefix = "user" @@ -157,7 +155,7 @@ def main() -> None: spark.read.format("datafusion") .option( "df.factory", - "org.apache.datafusion.examples.ExampleFfiProviderFactory", + "org.apache.datafusion.examples.ExampleBridgeProviderFactory", ) .option("name_prefix", name_prefix) .option("num_rows", str(num_rows)) @@ -184,7 +182,7 @@ def main() -> None: legacy_rows = {tuple(r) for r in df.collect()} # --- shared-scan mode ------------------------------------------------- - # `shared_scan=true` flips ExampleFfiProviderFactory.sharedScan: one + # `shared_scan=true` flips ExampleBridgeProviderFactory.sharedScan: one # provider + plan cached per executor, one Spark task per MemTable # partition (num_partitions=4), each task streaming one DataFusion plan # partition. 
Results must be identical to the legacy run above. @@ -193,7 +191,7 @@ def main() -> None: spark.read.format("datafusion") .option( "df.factory", - "org.apache.datafusion.examples.ExampleFfiProviderFactory", + "org.apache.datafusion.examples.ExampleBridgeProviderFactory", ) .option("name_prefix", name_prefix) .option("num_rows", str(num_rows)) diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java new file mode 100644 index 0000000..dff42ee --- /dev/null +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datafusion.examples; + +import io.datafusion.spark.NativeLibraryLoader; + +/** + * JNI surface generated on the Rust side by {@code export_bridge!} in {@code + * examples/native/src/lib.rs} with {@code jni_class = + * "org_apache_datafusion_examples_ExampleBridgeNative"} — the mangled binary name of THIS class. + * Renaming or moving this class requires regenerating the Rust macro invocation to match. + * + *

          The cdylib is bundled inside this jar at {@code org/apache/datafusion/examples///} + * (see the antrun execution in {@code examples/pom.xml}). For local hacking against an unpackaged + * build, {@code -Dexample.bridge.lib.path=/abs/path/to/libdatafusion_example_bridge.dylib} bypasses + * the bundled copy. + */ +final class ExampleBridgeNative { + + private ExampleBridgeNative() {} + + static { + String explicit = System.getProperty("example.bridge.lib.path"); + if (explicit != null && !explicit.isEmpty()) { + System.load(explicit); + } else { + NativeLibraryLoader.load( + ExampleBridgeNative.class, "org/apache/datafusion/examples", "datafusion_example_bridge"); + } + } + + static native byte[] providerSchemaIpc(byte[] options, byte[] partition); + + static native long createScan( + byte[] options, + byte[] partition, + int targetPartitions, + int batchSize, + String[] optionKeys, + String[] optionValues, + String[] projectionColumns, + byte[][] filterProtos); + + static native int partitionCount(long scanHandle); + + static native void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr); + + static native void executeStream(long scanHandle, long ffiStreamAddr); + + static native void closeScan(long scanHandle); +} diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java similarity index 81% rename from examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java rename to examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java index 3059830..27391fa 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java @@ -24,18 +24,19 @@ import java.nio.charset.StandardCharsets; import java.util.Map; -import 
io.datafusion.spark.FfiProviderFactory; +import io.datafusion.spark.BridgeProviderFactory; import io.datafusion.spark.PartitionInfo; +import io.datafusion.spark.ScanBackend; /** - * Minimal {@link FfiProviderFactory} that exposes the example {@code MemTable} produced by {@link - * FfiTableProviderExampleNative#createMemTableProvider(byte[])} as a Spark DataSource V2 source. + * Minimal {@link BridgeProviderFactory} that exposes the example {@code MemTable} built inside the + * example bridge cdylib (see {@code examples/native}) as a Spark DataSource V2 source. * *

          Wire it into PySpark with: * *

          {@code
            * df = (spark.read.format("datafusion")
          - *         .option("df.factory", "org.apache.datafusion.examples.ExampleFfiProviderFactory")
          + *         .option("df.factory", "org.apache.datafusion.examples.ExampleBridgeProviderFactory")
            *         .option("name_prefix", "user")
            *         .option("num_rows", "5")
            *         .option("num_batches", "3")
          @@ -71,12 +72,11 @@
            * two trailing fields are optional so older blobs keep decoding.
            *
            * 

          In the default mode a single partition (id {@code "p0"}, empty {@code partitionBytes}, no - * preferred host) is reported so Spark spawns one task; the executor calls {@link - * #createProvider(byte[], byte[])} to obtain a fresh {@code FFI_TableProvider} pointer, hands it to - * {@code FfiHelperNative.createScan}, and streams the resulting Arrow record batches back into the - * Spark scan. + * preferred host) is reported so Spark spawns one task; the executor hands the options bytes to + * {@code ExampleBridgeNative.createScan}, which builds the {@code MemTable} provider in process and + * streams the resulting Arrow record batches back into the Spark scan. */ -public final class ExampleFfiProviderFactory implements FfiProviderFactory { +public final class ExampleBridgeProviderFactory implements BridgeProviderFactory { static final String OPT_NAME_PREFIX = "name_prefix"; static final String OPT_NUM_ROWS = "num_rows"; @@ -89,7 +89,7 @@ public final class ExampleFfiProviderFactory implements FfiProviderFactory { static final int DEFAULT_NUM_BATCHES = 1; static final int DEFAULT_NUM_PARTITIONS = 1; - public ExampleFfiProviderFactory() {} + public ExampleBridgeProviderFactory() {} @Override public byte[] encodeOptions(Map sparkOptions) { @@ -123,7 +123,7 @@ public PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filterP // The example cannot prune its single partition, but a real bridge would inspect the // pushed predicates here and drop partitions that cannot match. System.out.println( - "ExampleFfiProviderFactory.listPartitions received " + "ExampleBridgeProviderFactory.listPartitions received " + filterProtoBytes.length + " pushed filter(s)"); return listPartitions(optionsProtoBytes); @@ -149,13 +149,8 @@ private static boolean hasTrailingFields(byte[] bytes) { } @Override - public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { - // The example bridge has no per-partition state; `partitionBytes` is ignored. 
- // The print makes provider-build amortization observable in the demo: shared-scan - // mode builds once per (executor x query) regardless of task count, while the - // per-partition path builds once per task. - System.out.println("ExampleFfiProviderFactory.createProvider building a MemTable provider"); - return FfiTableProviderExampleNative.createMemTableProvider(optionsProtoBytes); + public ScanBackend scanBackend() { + return new ExampleScanBackend(); } private static int parsePositiveInt(Map opts, String key, int defaultValue) { @@ -168,11 +163,11 @@ private static int parsePositiveInt(Map opts, String key, int de parsed = Integer.parseInt(raw.trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( - "ExampleFfiProviderFactory: option '" + key + "' must be an integer, got: " + raw); + "ExampleBridgeProviderFactory: option '" + key + "' must be an integer, got: " + raw); } if (parsed <= 0) { throw new IllegalArgumentException( - "ExampleFfiProviderFactory: option '" + key + "' must be > 0, got: " + parsed); + "ExampleBridgeProviderFactory: option '" + key + "' must be > 0, got: " + parsed); } return parsed; } diff --git a/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java similarity index 59% rename from spark/src/main/java/io/datafusion/spark/FfiScanBackend.java rename to examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java index 2da91fb..9854817 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiScanBackend.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleScanBackend.java @@ -17,26 +17,16 @@ * under the License. */ -package io.datafusion.spark; +package org.apache.datafusion.examples; -/** - * Generic FFI {@link ScanBackend}: asks the factory for a raw {@code FFI_TableProvider} pointer and - * routes everything through the connector's own cdylib ({@link FfiHelperNative}). 
This is the - * {@link FfiProviderFactory#scanBackend()} default; bridges that statically link their provider via - * {@code export_bridge!} replace it with a backend delegating to their own native class. - */ -public final class FfiScanBackend implements ScanBackend { - - private final FfiProviderFactory factory; +import io.datafusion.spark.ScanBackend; - public FfiScanBackend(FfiProviderFactory factory) { - this.factory = factory; - } +/** Routes the connector's scan calls to the example bridge cdylib. Pure delegation. */ +final class ExampleScanBackend implements ScanBackend { @Override public byte[] providerSchemaIpc(byte[] options, byte[] partitionBytes) { - long ptr = factory.createProvider(options, partitionBytes); - return FfiHelperNative.providerSchemaIpc(ptr); + return ExampleBridgeNative.providerSchemaIpc(options, partitionBytes); } @Override @@ -49,9 +39,9 @@ public long createScan( String[] optionValues, String[] projectionColumns, byte[][] filterProtos) { - long ptr = factory.createProvider(options, partitionBytes); - return FfiHelperNative.createScan( - ptr, + return ExampleBridgeNative.createScan( + options, + partitionBytes, targetPartitions, batchSize, optionKeys, @@ -62,21 +52,21 @@ public long createScan( @Override public int partitionCount(long scanHandle) { - return FfiHelperNative.partitionCount(scanHandle); + return ExampleBridgeNative.partitionCount(scanHandle); } @Override public void executeStreamPartition(long scanHandle, int partition, long ffiStreamAddr) { - FfiHelperNative.executeStreamPartition(scanHandle, partition, ffiStreamAddr); + ExampleBridgeNative.executeStreamPartition(scanHandle, partition, ffiStreamAddr); } @Override public void executeStream(long scanHandle, long ffiStreamAddr) { - FfiHelperNative.executeStream(scanHandle, ffiStreamAddr); + ExampleBridgeNative.executeStream(scanHandle, ffiStreamAddr); } @Override public void closeScan(long scanHandle) { - FfiHelperNative.closeScan(scanHandle); + 
ExampleBridgeNative.closeScan(scanHandle); } } diff --git a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java b/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java deleted file mode 100644 index 333fddf..0000000 --- a/examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datafusion.examples; - -import io.datafusion.spark.NativeLibraryLoader; - -/** - * JNI bindings into the example cdylib at {@code examples/native}. The cdylib produces a small - * {@code MemTable}-backed {@code FFI_TableProvider} that {@link ExampleFfiProviderFactory} hands to - * the Spark connector ({@code FfiHelperNative.createScan}). - * - *

          The cdylib is bundled inside this jar at {@code org/apache/datafusion/examples///} - * (see the antrun execution in {@code examples/pom.xml}) and extracted/loaded once via the - * connector's {@link NativeLibraryLoader} — the same two-piece recipe (pom copy block + one loader - * call) a real bridge uses to ship its own cdylib. For local hacking against an unpackaged build, - * {@code -Dexample.ffi.lib.path=/abs/path/to/libdatafusion_java_ffi_example.dylib} bypasses the - * bundled copy. - */ -final class FfiTableProviderExampleNative { - - private FfiTableProviderExampleNative() {} - - static { - String explicit = System.getProperty("example.ffi.lib.path"); - if (explicit != null && !explicit.isEmpty()) { - System.load(explicit); - } else { - NativeLibraryLoader.load( - FfiTableProviderExampleNative.class, - "org/apache/datafusion/examples", - "datafusion_java_ffi_example"); - } - } - - /** - * Build a {@code MemTable} on the Rust side, wrap it in an {@code FFI_TableProvider}, and return - * the raw boxed pointer as a {@code long}. Ownership transfers to the caller; passing the pointer - * to a consumer such as {@code FfiHelperNative.createScan} discharges it. - * - *

          {@code optionsBytes} is the length-prefixed binary blob produced by {@link - * ExampleFfiProviderFactory#encodeOptions(java.util.Map)}. An empty or {@code null} array decodes - * as all defaults ({@code name_prefix="row"}, {@code num_rows=4}, {@code num_batches=1}). - */ - static native long createMemTableProvider(byte[] optionsBytes); - - /** - * Drop an FFI_TableProvider pointer that was NEVER handed to a consumer. Call this only on the - * error path before handover; once {@code FfiHelperNative.createScan} (or {@code - * providerSchemaIpc}) accepts the pointer it owns the box. - */ - static native void dropProvider(long ffiTableProviderPtr); -} diff --git a/spark/README.md b/spark/README.md index cd3443f..bbbf48c 100644 --- a/spark/README.md +++ b/spark/README.md @@ -14,35 +14,31 @@ supplies everything else. ## How it fits together -Three layers, two of which already exist: +Two layers, one of which already exists: ``` your bridge (you write this) this module (already written) +--------------------------------+ +----------------------------------+ -| Rust cdylib | | connector cdylib (spark/native) | -| builds your TableProvider, | | type widening, session setup, | -| wraps it as FFI_TableProvider|-->| projection, filters, planning, | -| | | partition streams | -| Java FfiProviderFactory | | Scala/Java DSv2 plumbing | -| turns Spark options into | | (spark/src) schema inference, | -| bytes, hands pointers across |-->| pushdown, task planning, | -| | | shared-scan cache | +| cdylib on datafusion-spark- | | Scala/Java DSv2 plumbing | +| bridge (spark/bridge SDK): | | (spark/src) schema inference, | +| your TableProvider + one |<--| pushdown, task planning, | +| export_bridge! 
invocation; |-->| shared-scan cache | +| the SDK supplies widening, | | | +| session, filters, planning, | | (pure JVM — all native code | +| partition streams | | ships inside YOUR jar) | +--------------------------------+ +----------------------------------+ | v spark.read.format("...").load() ``` -The only things that cross between your Rust code and the connector are: - -- an opaque `FFI_TableProvider` pointer (your provider, handed over as a - `long`), and -- opaque `byte[]` blobs that *you* define (your options and per-partition - payloads — the connector never inspects them). - +The only things that cross between the JVM and your cdylib are opaque +`byte[]` blobs that *you* define (options and per-partition payloads — the +connector never inspects them) going in, and Arrow C streams coming back. Everything DataFusion-side (planning, filter application, execution) happens -inside the connector's native library. There is no DataFusion session on the -JVM side at all. +inside your bridge's native library. There is no DataFusion session on the +JVM side at all, and no `FFI_TableProvider` boundary anywhere — your +concrete provider is linked into the same cdylib as the scan machinery. ## Getting started: generate a bridge @@ -62,8 +58,8 @@ sections below explain what each generated piece is for. 
| # | Piece | Language | Contract lives at | Working example | |---|-------|----------|-------------------|-----------------| -| 1 | A JNI entry point that builds your `TableProvider` and returns a raw `FFI_TableProvider` pointer | Rust | — (plain `#[no_mangle]` JNI fn) | [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) | -| 2 | An `FfiProviderFactory` implementation | Java | [`src/main/java/io/datafusion/spark/FfiProviderFactory.java`](src/main/java/io/datafusion/spark/FfiProviderFactory.java) | [`examples/.../ExampleFfiProviderFactory.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleFfiProviderFactory.java) | +| 1 | A provider builder + one `export_bridge!` invocation | Rust | [`bridge/src/lib.rs`](bridge/src/lib.rs) (macro rustdoc) | [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) | +| 2 | A `BridgeProviderFactory` implementation (one required method) + the JNI/backend boilerplate | Java | [`src/main/java/io/datafusion/spark/BridgeProviderFactory.java`](src/main/java/io/datafusion/spark/BridgeProviderFactory.java) | [`examples/.../ExampleBridgeProviderFactory.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java) | | 3 | (optional) A `DatafusionSource` subclass giving your source a short name | Scala/Java | [`src/main/scala/io/datafusion/spark/DatafusionSource.scala`](src/main/scala/io/datafusion/spark/DatafusionSource.scala) | see "Wiring it into Spark" below | An end-to-end runnable version of all three — in-memory table, factory, and a @@ -72,10 +68,9 @@ PySpark script that scans, filters, and projects it — lives under ### 1. The Rust side -Two ways to build your cdylib. 
**Static (preferred when you own the -provider's source):** depend on the [`datafusion-spark-bridge`](bridge/) -SDK crate and let it generate the JNI surface — no `FFI_TableProvider`, no -`datafusion-ffi` ABI coupling, one cdylib, your choice of DataFusion version: +Depend on the [`datafusion-spark-bridge`](bridge/) SDK crate and let it +generate the JNI surface; you supply one builder turning your option / +partition bytes into a concrete `TableProvider`: ```rust use std::sync::Arc; @@ -102,62 +97,26 @@ export_bridge! { The macro's rustdoc lists the exact `static native` method set the named Java class must declare; your factory routes the connector to it by -overriding `scanBackend()` (see section 2). - -**FFI (when the provider arrives precompiled, or must stay on a different -DataFusion version):** one JNI function that decodes your options bytes, -builds an `Arc`, and wraps it: - -```rust -/// Host SessionContext for FFI_TableProvider::new's task-context plumbing. -/// MUST outlive every provider built from it — the FFI_TaskContextProvider -/// holds a non-owning reference, and the connector calls back through it on -/// every scan. Keep it in a static; a function-local context dropped after -/// this call leaves the provider with a dangling task-context source. -fn host_session_context() -> &'static Arc { - static CTX: OnceLock> = OnceLock::new(); - CTX.get_or_init(|| Arc::new(SessionContext::new())) -} - -let provider: Arc = runtime().block_on(build_provider(opts))?; -let ctx_provider: Arc = - Arc::clone(host_session_context()) as Arc; -let ffi = FFI_TableProvider::new( - provider, - /*can_support_pushdown_filters=*/ true, - Some(runtime().clone()), - FFI_TaskContextProvider::from(&ctx_provider), - /*logical_codec=*/ None, // default DataFusion codec -); -Box::into_raw(Box::new(ffi)) as jlong -``` - -Two lifetime rules: - -- Ownership of the returned pointer transfers to whoever you hand it to (the - factory passes it straight into the connector). 
-- The `SessionContext` behind the `FFI_TaskContextProvider` must live as long - as any provider built from it — hence the `static` above. Nothing is ever - registered on it; it exists only so scans can obtain a task context. +overriding `scanBackend()` (see section 2). One cdylib total: your provider +and the SDK's scan machinery are the same library, so there is no provider +hand-off across a binary boundary and no `datafusion-ffi` anywhere. The +builder receives empty partition bytes for the driver-side schema probe — +schema must not depend on per-partition state. [`examples/native/src/lib.rs`](../examples/native/src/lib.rs) is a complete, commented version of this for a `MemTable`. ### 2. The Java factory -`FfiProviderFactory` is the contract between Spark and your bridge. It must -have a no-arg constructor (executors instantiate it reflectively by class -name). Everything has a working default — Spark options are encoded with -`OptionsCodec` (decode them in Rust via +`BridgeProviderFactory` is the contract between Spark and your bridge. It +must have a no-arg constructor (executors instantiate it reflectively by +class name). The single required method is `scanBackend()` — Spark options +are encoded with `OptionsCodec` by default (decode them in Rust via `datafusion_spark_bridge::options::decode_options`), and `listPartitions` -reports one whole-dataset partition — so a minimal bridge overrides exactly -one method, chosen by which native path it uses. - -**Static bridge:** override `scanBackend()` to delegate to the JNI class you -named in `export_bridge!`: +defaults to one whole-dataset partition: ```java -public final class MyBridgeProviderFactory implements FfiProviderFactory { +public final class MyBridgeProviderFactory implements BridgeProviderFactory { @Override public ScanBackend scanBackend() { @@ -167,7 +126,9 @@ public final class MyBridgeProviderFactory implements FfiProviderFactory { /** Declares the native methods generated by export_bridge! 
and loads the cdylib. */ final class BridgeNative { - static { /* load your cdylib once, e.g. via a NativeLibraryLoader-style helper */ } + static { + NativeLibraryLoader.load(BridgeNative.class, "com/example/mybridge", "my_bridge"); + } static native byte[] providerSchemaIpc(byte[] options, byte[] partition); static native long createScan(byte[] options, byte[] partition, int targetPartitions, int batchSize, String[] optionKeys, @@ -180,22 +141,7 @@ final class BridgeNative { ``` (`MyBridgeBackend implements ScanBackend` forwards each method to -`BridgeNative` — pure boilerplate the scaffold will generate.) - -**FFI bridge:** override `createProvider` instead; the default -`scanBackend()` routes the pointer through the connector's own cdylib: - -```java -public final class MyBridgeProviderFactory implements FfiProviderFactory { - - /** Build the provider for one slice. Called with EMPTY partitionBytes for - * the driver-side schema probe — schema must not depend on the slice. */ - @Override - public long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { - return MyBridgeNative.createFfiProvider(optionsProtoBytes, partitionBytes); - } -} -``` +`BridgeNative` — pure boilerplate the scaffold generates.) Override `encodeOptions` only if the bridge already has its own options schema (e.g. a protobuf), and `listPartitions` when the dataset should split @@ -216,7 +162,7 @@ into more than one Spark task: The remaining optional methods — `sharedScan`, `reportPartitioning`, and the filter-aware `listPartitions(opts, filters)` overload — are covered in their own sections below. Their javadoc in -[`FfiProviderFactory.java`](src/main/java/io/datafusion/spark/FfiProviderFactory.java) +[`BridgeProviderFactory.java`](src/main/java/io/datafusion/spark/BridgeProviderFactory.java) is the authoritative contract. ### 3. 
Wiring it into Spark @@ -270,9 +216,9 @@ static { The pom side is one antrun copy execution plus per-host profiles; the examples module is a complete working copy of the pattern (see the -`copy-ffi-example-cdylib` execution and the `native-*` profiles in +`copy-example-bridge-cdylib` execution and the `native-*` profiles in [`examples/pom.xml`](../examples/pom.xml), and the loader call in -[`FfiTableProviderExampleNative.java`](../examples/src/main/java/org/apache/datafusion/examples/FfiTableProviderExampleNative.java)). +[`ExampleBridgeNative.java`](../examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeNative.java)). For a multi-platform jar, build the cdylib per platform in CI and copy each into its own `//` directory before `mvn package` — the layout supports them side by side. @@ -416,7 +362,7 @@ executor's partition count diverges from the driver's, but equal counts with different contents are undetectable by construction. The provider's `ExecutionPlan` must also tolerate `execute(i)` being called more than once per plan instance (Spark retries and speculatively re-executes tasks). Full -contract: `FfiProviderFactory.sharedScan` javadoc. +contract: `BridgeProviderFactory.sharedScan` javadoc. 
Shared-scan operational details: @@ -461,48 +407,47 @@ Shared-scan operational details: | Phase | Where | Path | | ----- | ----- | ---- | -| Schema inference | Driver | `factory.encodeOptions` → `factory.createProvider(opts, EMPTY)` → connector cdylib widens + returns the Arrow schema | +| Schema inference | Driver | `factory.encodeOptions` → `backend.providerSchemaIpc(opts, EMPTY)` — bridge cdylib builds + widens the provider, returns the Arrow schema | | Scan planning (default mode) | Driver | `factory.listPartitions(opts[, filters])` → one task per entry, with its `partitionBytes` + `preferredLocations` | | Scan planning (shared-scan) | Driver | probe build (same code path executors use) → plan partition count `N` → `N` tasks | | Predicate translation | Driver | `SparkPredicateTranslator` → proto bytes per pushed predicate | -| Per-task scan (default mode) | Executor | `createProvider(opts, partitionBytes)` → `FfiHelperNative.createScan` (widen, project, filter, plan) → stream whole plan | +| Per-task scan (default mode) | Executor | `backend.createScan(opts, partitionBytes, ...)` (build provider, widen, project, filter, plan) → stream whole plan | | Per-task scan (shared-scan) | Executor | cache-acquire by `scanId` (first task builds) → stream plan partition `i` → release | -The JNI surface backing all of this is -[`FfiHelperNative.java`](src/main/java/io/datafusion/spark/FfiHelperNative.java) -/ [`native/src/scan.rs`](native/src/scan.rs). +The native machinery backing all of this is +[`bridge/src/scan.rs`](bridge/src/scan.rs), exported into each bridge's +cdylib by `export_bridge!` and reached through its [`ScanBackend`](src/main/java/io/datafusion/spark/ScanBackend.java). 
## Module layout ``` spark/ -├── src/main/java/io/datafusion/spark/ public SPI + JNI boundary (Java on -│ purpose: bridge jars stay Scala-free) -│ FfiProviderFactory.java <- the contract you implement -│ ScanBackend.java <- native scan surface (per-bridge -│ or the generic FfiScanBackend) +├── src/main/java/io/datafusion/spark/ public SPI (Java on purpose: +│ bridge jars stay Scala-free) +│ BridgeProviderFactory.java <- the contract you implement +│ ScanBackend.java <- native scan surface (delegations +│ to your bridge's JNI class) +│ NativeLibraryLoader.java <- bundled-cdylib extraction/loading │ PartitionInfo.java <- one entry = one Spark task │ ReportedPartitioning.java <- optional shuffle-elision declaration -│ FfiHelperNative.java <- JNI into the connector cdylib ├── src/main/scala/io/datafusion/spark/ connector internals (DSv2 wiring, │ readers, pushdown, shared-scan cache) -├── bridge/ datafusion-spark-bridge SDK rlib: -│ widening + scan machinery + -│ export_bridge! for static bridges -└── native/ connector cdylib: thin JNI shims for - the generic FfiHelperNative (FFI - path), all logic in bridge/ +└── bridge/ datafusion-spark-bridge SDK rlib: + widening + scan machinery + + export_bridge! (the native side of + every bridge cdylib) ``` ## Caveats -- One logical-extension codec per provider — the connector uses DataFusion's - default codec when deserializing pushed filter expressions, which covers - columns, literals, and built-in functions. Bridges whose providers - round-trip custom `LogicalNode`s need a custom codec at - `FFI_TableProvider::new` time. -- Each cdylib brings its own Tokio runtime and (for TLS-using bridges) its - own rustls install. Both should be `Once`-gated in your bridge. -- The connector and your bridge must agree on the `datafusion-ffi` ABI — - build both against the same DataFusion major version (this repo pins it in - the workspace [`Cargo.toml`](../Cargo.toml)). 
+- Pushed filter expressions are deserialized with DataFusion's default + logical-extension codec, which covers columns, literals, and built-in + functions. Anything the Spark-side translator can't express stays in Spark + as a residual filter, so coverage gaps cost performance, never + correctness. +- The bridge cdylib's DataFusion version is the SDK's: cargo resolves one + `datafusion` for your provider and the scan machinery together, pinned in + this repo's workspace [`Cargo.toml`](../Cargo.toml). Upgrading DataFusion + means rebuilding the bridge against a newer SDK. +- The SDK's Tokio runtime is per-cdylib and `Once`-gated; TLS-using bridges + should `Once`-gate their rustls install the same way. diff --git a/spark/bridge/Cargo.toml b/spark/bridge/Cargo.toml index 26abe2a..8ed4684 100644 --- a/spark/bridge/Cargo.toml +++ b/spark/bridge/Cargo.toml @@ -22,18 +22,10 @@ edition = "2021" publish = false description = "SDK for building Spark connector bridges over DataFusion TableProviders" -[features] -default = ["ffi"] -# Import providers across a cdylib boundary as FFI_TableProvider. Bridges -# that statically link their provider via `export_bridge!` don't need it -# and can drop the datafusion-ffi dependency entirely. -ffi = ["dep:datafusion-ffi"] - [dependencies] arrow = { workspace = true } async-trait = { workspace = true } datafusion = { workspace = true } -datafusion-ffi = { workspace = true, optional = true } datafusion-jni-common = { path = "../../native-common" } datafusion-proto = { workspace = true } futures = { workspace = true } diff --git a/spark/bridge/src/ffi.rs b/spark/bridge/src/ffi.rs deleted file mode 100644 index 1ac8630..0000000 --- a/spark/bridge/src/ffi.rs +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Importing providers across a cdylib boundary (the generic FFI path). - -use std::sync::Arc; - -use datafusion::catalog::TableProvider; -use datafusion_ffi::table_provider::FFI_TableProvider; -use datafusion_jni_common::errors::JniResult; -use jni::sys::jlong; - -/// Take ownership of a bridge cdylib's `FFI_TableProvider` pointer and return -/// the in-process provider view. The pointer must be the raw boxed address -/// (`Box::into_raw(Box::new(FFI_TableProvider))`) and must not be reused -/// after this call. -pub fn import_ffi_provider(ffi_raw_ptr: jlong) -> JniResult> { - if ffi_raw_ptr == 0 { - return Err("FFI_TableProvider pointer is null".into()); - } - let ffi_raw: Box = - unsafe { Box::from_raw(ffi_raw_ptr as *mut FFI_TableProvider) }; - // `Arc::::from(&FFI_TableProvider)` returns a - // ForeignTableProvider that delegates through the producer's vtable; it - // owns its own retained copy, so our Box can drop immediately. - let provider: Arc = (&*ffi_raw).into(); - drop(ffi_raw); - Ok(provider) -} diff --git a/spark/bridge/src/lib.rs b/spark/bridge/src/lib.rs index b9d0f82..90e5e24 100644 --- a/spark/bridge/src/lib.rs +++ b/spark/bridge/src/lib.rs @@ -20,27 +20,15 @@ //! Everything the Spark connector needs DataFusion-side lives here: the //! Spark-type [`widening`] layer, and the [`scan`] machinery (session from //! 
pinned config, projection, proto filters, planning, partition streams). -//! Two ways to consume it: -//! -//! - **Static bridge (preferred when you own the provider's source).** Your -//! cdylib depends on this crate and invokes [`export_bridge!`] with a -//! builder that constructs your concrete `TableProvider` from option / -//! partition bytes. One cdylib, no `datafusion-ffi` ABI boundary, your -//! choice of DataFusion version. -//! -//! - **FFI bridge (when the provider arrives precompiled).** A cdylib takes -//! a raw `FFI_TableProvider` pointer from another library and imports it -//! via [`ffi::import_ffi_provider`]. This is what the connector's own -//! `datafusion-spark-helper` cdylib does for the generic -//! `io.datafusion.spark.FfiHelperNative` path. +//! A bridge cdylib depends on this crate and invokes [`export_bridge!`] with +//! a builder that constructs its concrete `TableProvider` from option / +//! partition bytes — one cdylib, no FFI provider boundary; the only foreign +//! interface is JNI plus Arrow's C stream for the results. pub mod options; pub mod scan; pub mod widening; -#[cfg(feature = "ffi")] -pub mod ffi; - // Re-exported so `export_bridge!` expansions resolve these crates inside the // bridge author's crate without extra dependencies, and so builder signatures // can be written against `datafusion_spark_bridge::datafusion::...`. @@ -94,9 +82,7 @@ pub(crate) fn runtime_handle() -> &'static Handle { /// (`com.example.mybridge.BridgeNative` → `"com_example_mybridge_BridgeNative"`). /// If the class or package name itself contains an underscore, JNI mangling /// requires it written as `_1`. Per-bridge class names are what let several -/// bridges coexist in one Spark JVM — never export under -/// `io_datafusion_spark_FfiHelperNative`, that name belongs to the generic -/// FFI helper. +/// bridges coexist in one Spark JVM. 
/// /// `build_provider` is anything callable as /// `Fn(&BridgeContext, &[u8], &[u8]) -> JniResult>`, diff --git a/spark/bridge/src/options.rs b/spark/bridge/src/options.rs index b794561..117ca9d 100644 --- a/spark/bridge/src/options.rs +++ b/spark/bridge/src/options.rs @@ -17,7 +17,7 @@ //! Decoder for the connector's default options wire format. //! -//! `FfiProviderFactory.encodeOptions`'s default (`OptionsCodec` on the JVM +//! `BridgeProviderFactory.encodeOptions`'s default (`OptionsCodec` on the JVM //! side) encodes the Spark options map as length-prefixed UTF-8 pairs, //! sorted by key: big-endian `i32` entry count, then per entry key length, //! key bytes, value length, value bytes. Key-sorting makes the bytes a pure diff --git a/spark/native/Cargo.toml b/spark/native/Cargo.toml deleted file mode 100644 index cfdb1db..0000000 --- a/spark/native/Cargo.toml +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 - -[package] -name = "datafusion-spark-helper" -version = "0.1.0" -edition = "2021" -publish = false - -[lib] -# cdylib for the JVM to load via System.load. All logic lives in the -# datafusion-spark-bridge rlib; this crate is only the JNI symbol surface -# for the generic io.datafusion.spark.FfiHelperNative class. 
-crate-type = ["cdylib"] - -[dependencies] -datafusion-spark-bridge = { path = "../bridge" } -jni = { workspace = true } diff --git a/spark/native/src/lib.rs b/spark/native/src/lib.rs deleted file mode 100644 index e89918f..0000000 --- a/spark/native/src/lib.rs +++ /dev/null @@ -1,104 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Generic FFI-path cdylib behind `io.datafusion.spark.FfiHelperNative`. -//! -//! Thin JNI shims: each entry point imports the bridge cdylib's raw -//! `FFI_TableProvider` pointer and delegates to the scan machinery in -//! `datafusion-spark-bridge` (widening, session from pinned config, -//! projection, proto filters, planning, partition streams). Bridges that -//! statically link their provider use `datafusion_spark_bridge::export_bridge!` -//! with their own JNI class name instead of this library. 
- -use datafusion_spark_bridge::ffi::import_ffi_provider; -use datafusion_spark_bridge::scan; -use jni::objects::{JClass, JObjectArray}; -use jni::sys::{jbyteArray, jint, jlong}; -use jni::JNIEnv; - -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_providerSchemaIpc<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_raw_ptr: jlong, -) -> jbyteArray { - scan::provider_schema_ipc(&mut env, |_env| import_ffi_provider(ffi_raw_ptr)) -} - -#[no_mangle] -#[allow(clippy::too_many_arguments)] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_createScan<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - ffi_raw_ptr: jlong, - target_partitions: jint, - batch_size: jint, - option_keys: JObjectArray<'local>, - option_values: JObjectArray<'local>, - projection_columns: JObjectArray<'local>, - filter_protos: JObjectArray<'local>, -) -> jlong { - scan::create_scan( - &mut env, - |_env| import_ffi_provider(ffi_raw_ptr), - target_partitions, - batch_size, - &option_keys, - &option_values, - &projection_columns, - &filter_protos, - ) -} - -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_partitionCount<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) -> jint { - scan::partition_count(&mut env, handle) -} - -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStreamPartition<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - partition: jint, - ffi_stream_addr: jlong, -) { - scan::execute_stream_partition(&mut env, handle, partition, ffi_stream_addr) -} - -#[no_mangle] -pub extern "system" fn Java_io_datafusion_spark_FfiHelperNative_executeStream<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, - ffi_stream_addr: jlong, -) { - scan::execute_stream(&mut env, handle, ffi_stream_addr) -} - -#[no_mangle] -pub extern "system" fn 
Java_io_datafusion_spark_FfiHelperNative_closeScan<'local>( - mut env: JNIEnv<'local>, - _class: JClass<'local>, - handle: jlong, -) { - scan::close_scan(&mut env, handle) -} diff --git a/spark/pom.xml b/spark/pom.xml index 05355de..90e4e6d 100644 --- a/spark/pom.xml +++ b/spark/pom.xml @@ -34,19 +34,18 @@ under the License. Apache DataFusion Java Spark Connector - Generic Spark DataSource V2 connector that registers DataFusion - TableProviders via FFI. Domain bridges implement FfiProviderFactory; - this module supplies the Spark plumbing, predicate translation, - Arrow-to-Spark schema conversion, and a widening cdylib that wraps - an FFI_TableProvider in a WideningTableProvider before Spark sees - it. + Generic Spark DataSource V2 connector for DataFusion TableProviders. + Domain bridges implement BridgeProviderFactory over a cdylib built + with the datafusion-spark-bridge Rust SDK; this module supplies the + Spark plumbing, predicate translation, Arrow-to-Spark schema + conversion, and the shared-scan cache. Pure JVM artifact — the + native code ships inside each bridge's own jar. 2.13 2.13.14 3.5.7 - debug @@ -128,32 +127,6 @@ under the License. true - - org.apache.maven.plugins - maven-antrun-plugin - 3.1.0 - - - copy-widening-cdylib - process-classes - run - - - - - - - - - - - - - org.scalatest scalatest-maven-plugin @@ -174,81 +147,4 @@ under the License. 
- - - - native-linux-amd64 - - unixlinuxamd64 - - - linux - x86_64 - libdatafusion_spark_helper.so - - - - native-linux-x86_64 - - unixlinuxx86_64 - - - linux - x86_64 - libdatafusion_spark_helper.so - - - - native-linux-aarch64 - - unixlinuxaarch64 - - - linux - aarch64 - libdatafusion_spark_helper.so - - - - native-mac-x86_64 - - macx86_64 - - - darwin - x86_64 - libdatafusion_spark_helper.dylib - - - - native-mac-amd64 - - macamd64 - - - darwin - x86_64 - libdatafusion_spark_helper.dylib - - - - native-mac-aarch64 - - macaarch64 - - - darwin - aarch64 - libdatafusion_spark_helper.dylib - - - diff --git a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java b/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java similarity index 64% rename from spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java rename to spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java index 5d64a23..1231d35 100644 --- a/spark/src/main/java/io/datafusion/spark/FfiProviderFactory.java +++ b/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java @@ -23,30 +23,28 @@ /** * Bridge interface implemented per domain (HDF5, custom Iceberg, an in-house format, etc.). A - * bridge owns its options encoding and a native scan implementation; the connector-core Spark - * plumbing is generic — it knows only this interface. + * bridge owns its options encoding and a native scan implementation built with {@code + * datafusion_spark_bridge::export_bridge!}; the connector Spark plumbing is generic — it knows only + * this interface. * - *

          Two kinds of bridge, distinguished by which method they override: - * - *

            - *
          • Static bridge (preferred when the provider's Rust source is yours): the cdylib is - * built with {@code datafusion_spark_bridge::export_bridge!} and constructs the provider from - * the options/partition bytes natively. Override {@link #scanBackend()} to delegate to the - * JNI class named in the macro; {@link #createProvider(byte[], byte[])} is never called. - *
          • FFI bridge (the provider arrives precompiled, or must stay on a different DataFusion - * version): override {@link #createProvider(byte[], byte[])} to return a raw {@code - * FFI_TableProvider} pointer; the default {@link #scanBackend()} routes it through the - * connector's own cdylib. - *
          - * - *

          Everything else has a working default: {@link #encodeOptions(Map)} encodes the Spark options - * via {@link OptionsCodec}, and {@link #listPartitions(byte[])} reports a single partition. A - * minimal bridge therefore overrides exactly one method. + *

          The single required method is {@link #scanBackend()}, returning the delegations to the JNI + * class the bridge named in its {@code export_bridge!} invocation. Everything else has a working + * default: {@link #encodeOptions(Map)} encodes the Spark options via {@link OptionsCodec}, and + * {@link #listPartitions(byte[])} reports a single partition. * *

          Implementations must be no-arg constructable so the Spark connector can instantiate them * reflectively via {@link Class#forName(String)} on the executor. */ -public interface FfiProviderFactory { +public interface BridgeProviderFactory { + + /** + * The native scan implementation this bridge talks to: delegations to the JNI class named in the + * bridge's {@code export_bridge!} invocation, whose generated {@code createScan} builds the + * provider from the options/partition bytes in process. Called wherever the connector needs + * native work — driver-side schema/plan probes and executor-side streams — always on a factory + * freshly instantiated from its class name, so the returned backend never has to be serializable. + */ + ScanBackend scanBackend(); /** * Convert Spark's flat option map to the bridge's encoded options. Driver-side only; the bytes @@ -68,9 +66,9 @@ default byte[] encodeOptions(Map sparkOptions) { * PartitionInfo}. Driver-side only. * *

          Each partition's {@code partitionBytes} ships verbatim through {@code - * DatafusionInputPartition} to the executor, where it is passed to {@link #createProvider(byte[], - * byte[])}. Use it to encode whatever slice metadata (row range, sub-options, file offsets, - * segment id, …) the bridge needs to materialise *that* partition. + * DatafusionInputPartition} to the executor, where it is passed to {@link + * ScanBackend#createScan}. Use it to encode whatever slice metadata (row range, sub-options, file + * offsets, segment id, …) the bridge needs to materialise *that* partition. * *

          Each partition's {@code preferredLocations} hostnames are returned from {@code * InputPartition.preferredLocations()} so Spark co-locates the task with the data; empty array = @@ -88,10 +86,10 @@ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes) { /** * Filter-aware variant of {@link #listPartitions(byte[])}. The connector calls this overload with * the pushed-down predicates ({@code LogicalExprNode} proto bytes, one array per predicate, same - * encoding the executor later replays via {@code FfiHelperNative.createScan}). Bridges that can - * map predicates onto their partition layout (e.g. {@code segment_id = 'x'}) should prune - * partitions that cannot match — pruning here eliminates whole Spark tasks, whereas the per-task - * filter only reduces rows inside a task. + * encoding the executor later replays via {@link ScanBackend#createScan}). Bridges that can map + * predicates onto their partition layout (e.g. {@code segment_id = 'x'}) should prune partitions + * that cannot match — pruning here eliminates whole Spark tasks, whereas the per-task filter only + * reduces rows inside a task. * *

          Pruning must be conservative: only drop a partition when NO row in it can satisfy the * conjunction of all pushed predicates. The default delegates to the filter-unaware overload (no @@ -108,7 +106,7 @@ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filter *

          When {@code true}, the connector builds ONE provider per (executor JVM × scan) with empty * {@code partitionBytes}, plans it once, and runs one Spark task per DataFusion output partition * — task {@code i} streams plan partition {@code i} from the shared, cached plan. This amortises - * {@code createProvider} cost across all tasks on an executor and is the right model when the + * provider construction cost across all tasks on an executor and is the right model when the * dataset has many small partitions or provider construction is expensive (remote metadata, * connections). {@link #listPartitions(byte[])} and {@link #reportPartitioning(byte[])} are NOT * called in this mode, and the scan reports {@code UnknownPartitioning} (DataFusion-native @@ -135,40 +133,6 @@ default boolean sharedScan(byte[] optionsProtoBytes) { return false; } - /** - * Build the underlying {@code Arc} for one partition and wrap it in an {@code - * FFI_TableProvider}. Returns the raw {@code Box::into_raw} pointer as a {@code jlong}; the - * caller takes ownership. Only the FFI path ({@link FfiScanBackend}, the {@link #scanBackend()} - * default) calls this — static bridges override {@link #scanBackend()} instead and leave this - * default in place. - * - * @param optionsProtoBytes global options produced by {@link #encodeOptions(Map)} - * @param partitionBytes per-partition slice payload from {@link PartitionInfo#partitionBytes()}. - * Empty array for single-partition tables and for the driver-side schema probe in {@code - * DatafusionSource.inferSchema}. - */ - default long createProvider(byte[] optionsProtoBytes, byte[] partitionBytes) { - throw new UnsupportedOperationException( - getClass().getName() - + " uses the default FFI scan backend but does not implement createProvider. " - + "Override createProvider (FFI bridge) or scanBackend (static export_bridge! " - + "bridge)."); - } - - /** - * The native scan implementation this bridge talks to. 
Called wherever the connector needs native - * work — driver-side schema/plan probes and executor-side streams — always on a factory freshly - * instantiated from its class name, so the returned backend never has to be serializable. - * - *

          Default: the generic FFI path ({@link FfiScanBackend} over {@link #createProvider(byte[], - * byte[])} and the connector's own cdylib). Static bridges built with {@code - * datafusion_spark_bridge::export_bridge!} override this to return a backend that loads their - * cdylib and delegates each method to the JNI class named in the macro invocation. - */ - default ScanBackend scanBackend() { - return new FfiScanBackend(this); - } - /** * Declare how rows are partitioned across the {@link PartitionInfo} entries returned by {@link * #listPartitions(byte[])}. Driver-side only. diff --git a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java b/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java deleted file mode 100644 index fe93fb6..0000000 --- a/spark/src/main/java/io/datafusion/spark/FfiHelperNative.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package io.datafusion.spark; - -/** - * JNI surface of the connector cdylib ({@code libdatafusion_spark_helper.{so,dylib}}). - * - *

          The cdylib owns the whole DataFusion side of a scan: it takes an {@code FFI_TableProvider} - * pointer produced by a bridge, wraps the provider in a {@code WideningTableProvider} (kernel-level - * {@code arrow::compute::cast} for Spark-incompatible Arrow types), registers it on a private - * {@code SessionContext} built from the driver-pinned config, applies the pruned projection and the - * proto-encoded pushed filters, plans once, and streams plan partitions back over {@code - * FFI_ArrowArrayStream}. - * - *

          Errors throw the typed {@code org.apache.datafusion.*} exception hierarchy (from the - * datafusion-java core jar, a compile dependency of this module). - * - *

          The native library is loaded once per JVM via {@link NativeLibraryLoader}. The library payload - * lives inside this jar under {@code io/datafusion/spark///} and is extracted to a temp - * file before {@link System#load}. - */ -public final class FfiHelperNative { - - private FfiHelperNative() {} - - static { - NativeLibraryLoader.loadLibrary("datafusion_spark_helper"); - } - - /** - * Driver-side schema probe: the widened Arrow schema of the provider, serialized as Arrow IPC - * bytes (deserialize with {@code MessageSerializer.deserializeSchema}). - * - *

          Takes ownership of {@code ffiProviderRawPtr}; the provider is dropped before returning and - * the pointer must not be reused. - */ - public static native byte[] providerSchemaIpc(long ffiProviderRawPtr); - - /** - * Build a planned scan over the provider and return its handle. - * - *

          Takes ownership of {@code ffiProviderRawPtr}. {@code targetPartitions} / {@code batchSize} - * {@code <= 0} leave the DataFusion defaults; {@code optionKeys}/{@code optionValues} are - * parallel arrays of DataFusion config overrides; an empty {@code projectionColumns} selects all - * columns; each element of {@code filterProtos} is a serialized {@code - * datafusion.LogicalExprNode} applied as a filter. - * - *

          The caller owns the returned handle and must pair it with {@link #closeScan(long)}. Closing - * while a stream opened from this handle is still in flight is undefined behaviour — the - * shared-scan cache's refcount enforces this; any other caller must serialize close itself. - */ - public static native long createScan( - long ffiProviderRawPtr, - int targetPartitions, - int batchSize, - String[] optionKeys, - String[] optionValues, - String[] projectionColumns, - byte[][] filterProtos); - - /** Output partition count of the planned physical plan. */ - public static native int partitionCount(long scanHandle); - - /** - * Open an independent stream over ONE plan partition, writing an {@code FFI_ArrowArrayStream} - * into the caller-allocated struct at {@code ffiStreamAddr}. Concurrent-safe across JVM threads. - */ - public static native void executeStreamPartition( - long scanHandle, int partition, long ffiStreamAddr); - - /** - * Stream the WHOLE plan (all partitions coalesced) into the caller-allocated {@code - * FFI_ArrowArrayStream} at {@code ffiStreamAddr}. Used by legacy per-partition payload mode, - * where the provider itself already represents the task's slice. - */ - public static native void executeStream(long scanHandle, long ffiStreamAddr); - - /** Drop the planned scan. See {@link #createScan} for the close-vs-in-flight-stream contract. */ - public static native void closeScan(long scanHandle); -} diff --git a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java index 6c7ecd5..eb4766a 100644 --- a/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java +++ b/spark/src/main/java/io/datafusion/spark/NativeLibraryLoader.java @@ -39,11 +39,9 @@ * where {@code } is one of {@code linux}, {@code darwin}, {@code windows} and {@code } is * {@code x86_64} or {@code aarch64}. * - *

          The connector loads its own cdylib through this class (prefix {@code io/datafusion/spark}); - * bridges are encouraged to reuse it via {@link #load(Class, String, String)} from their native - * class's static initializer, with their own resource prefix, instead of hand-rolling extraction. - * Bundle the cdylib with the same antrun-copy pattern the connector's pom uses (see "Packaging your - * bridge" in {@code spark/README.md}). + *

          Bridges call {@link #load(Class, String, String)} from their native class's static + * initializer, with their own resource prefix, instead of hand-rolling extraction. Bundle the + * cdylib with the antrun-copy pattern shown in "Packaging your bridge" in {@code spark/README.md}. */ public final class NativeLibraryLoader { @@ -52,11 +50,6 @@ public final class NativeLibraryLoader { private NativeLibraryLoader() {} - /** Connector-internal entry: loads from the connector jar's own prefix. */ - static void loadLibrary(String name) { - load(NativeLibraryLoader.class, "io/datafusion/spark", name); - } - /** * Extract {@code ///} from {@code anchor}'s classloader * and {@link System#load} it. Idempotent per (prefix, name): repeated calls — e.g. one per Spark diff --git a/spark/src/main/java/io/datafusion/spark/OptionsCodec.java b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java index 9b1feca..0d16a28 100644 --- a/spark/src/main/java/io/datafusion/spark/OptionsCodec.java +++ b/spark/src/main/java/io/datafusion/spark/OptionsCodec.java @@ -27,8 +27,8 @@ import java.util.TreeMap; /** - * Default wire format for {@link FfiProviderFactory#encodeOptions(Map)}: the Spark options map as - * length-prefixed UTF-8 pairs, sorted by key. + * Default wire format for {@link BridgeProviderFactory#encodeOptions(Map)}: the Spark options map + * as length-prefixed UTF-8 pairs, sorted by key. * *

          Layout (all integers big-endian {@code int32}): entry count, then per entry key length, key * bytes, value length, value bytes. Key-sorting makes the bytes a pure function of the map's diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java index 1dea72e..67376db 100644 --- a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -21,8 +21,8 @@ /** * Driver-side descriptor for a single partition produced by {@link - * FfiProviderFactory#listPartitions(byte[])}. Carries the bridge-specific slice payload that the - * executor passes back into {@link FfiProviderFactory#createProvider(byte[], byte[])}, plus + * BridgeProviderFactory#listPartitions(byte[])}. Carries the bridge-specific slice payload that the + * executor passes back into {@link BridgeProviderFactory#createProvider(byte[], byte[])}, plus * optional host hints for Spark's scheduler. * *

          Fields: @@ -32,13 +32,13 @@ * Surfaces in Spark UI, logs, and exception messages. Must be non-empty. *

        • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the executor * needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). Combined with - * the global {@code optionsProtoBytes} in {@link FfiProviderFactory#createProvider(byte[], + * the global {@code optionsProtoBytes} in {@link BridgeProviderFactory#createProvider(byte[], * byte[])}. Empty array = no per-partition state (single-partition table). *
        • {@code preferredLocations} — hostnames where this partition's data lives. Returned from * {@code InputPartition.preferredLocations()} so Spark can co-locate the task with the data. * Empty array = no preference. Honoured subject to {@code spark.locality.wait}. *
        • {@code partitionKeyValues} — optional values of the partitioning keys for every row in this - * partition, in the same order as {@link FfiProviderFactory#reportPartitioning(byte[])}'s + * partition, in the same order as {@link BridgeProviderFactory#reportPartitioning(byte[])}'s * declared transforms. {@code null} = no key (the default). When the bridge reports a * partitioning AND every partition carries key values, the connector exposes them to Spark * via {@code HasPartitionKey} — required on Spark 3.3+ for the reported {@code diff --git a/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java index 01fbd1b..639fec9 100644 --- a/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java +++ b/spark/src/main/java/io/datafusion/spark/ReportedPartitioning.java @@ -26,13 +26,13 @@ /** * Driver-side declaration of how a bridge's data is partitioned on the key columns. When supplied - * via {@link FfiProviderFactory#reportPartitioning(byte[])}, the connector surfaces a {@link + * via {@link BridgeProviderFactory#reportPartitioning(byte[])}, the connector surfaces a {@link * org.apache.spark.sql.connector.read.partitioning.KeyGroupedPartitioning} from {@link * org.apache.spark.sql.connector.read.SupportsReportPartitioning#outputPartitioning()} — Spark's * optimizer can then skip the shuffle ahead of joins/aggregations whose grouping keys line up with * these transforms. * - *

          Contract: for any partition reported by {@link FfiProviderFactory#listPartitions(byte[])}, + *

          Contract: for any partition reported by {@link BridgeProviderFactory#listPartitions(byte[])}, * every row produced by that partition must evaluate to the same tuple of key values under these * transforms. Different partitions may share key values (Spark will fuse them); they must * not straddle key values. diff --git a/spark/src/main/java/io/datafusion/spark/ScanBackend.java b/spark/src/main/java/io/datafusion/spark/ScanBackend.java index 21b9873..63ef0b5 100644 --- a/spark/src/main/java/io/datafusion/spark/ScanBackend.java +++ b/spark/src/main/java/io/datafusion/spark/ScanBackend.java @@ -20,19 +20,10 @@ package io.datafusion.spark; /** - * Native scan surface the connector plumbing talks to. One method per JNI entry point of the {@code - * datafusion-spark-bridge} scan machinery; implementations only differ in which native - * library and class the calls land on: - * - *

            - *
          • {@link FfiScanBackend} (the {@link FfiProviderFactory#scanBackend()} default) builds the - * provider via {@link FfiProviderFactory#createProvider(byte[], byte[])} and routes through - * the connector's own cdylib ({@link FfiHelperNative}) — the generic FFI path. - *
          • A static bridge supplies its own implementation delegating to the class it named in its - * {@code export_bridge!} invocation, whose generated {@code createScan} builds the provider - * from {@code options}/{@code partitionBytes} directly — no pointer handover, no {@code - * datafusion-ffi}. - *
          + * Native scan surface the connector plumbing talks to: one method per JNI entry point generated by + * the bridge's {@code datafusion_spark_bridge::export_bridge!} invocation. A bridge's + * implementation is six one-line delegations to the JNI class named in that macro, whose {@code + * createScan} builds the provider from {@code options}/{@code partitionBytes} in process. * *

          Implementations must be stateless or thread-safe: the driver probes schemas and plans through * one instance while executor tasks stream through others, and scan handles are shared across diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index c2ec384..a97ecbe 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -100,7 +100,7 @@ private[spark] object DatafusionBatch { } if (withKeys != partitions.length) { throw new IllegalStateException( - s"FfiProviderFactory '$factoryFqcn' reported a partitioning but only $withKeys of " + + s"BridgeProviderFactory '$factoryFqcn' reported a partitioning but only $withKeys of " + s"${partitions.length} PartitionInfo entries carry partitionKeyValues; either all " + "partitions must carry key values or none") } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index 6e59b3d..678d258 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch /** * Per-task columnar reader for the per-partition payload (legacy) path. Lifecycle: * - * 1. Reflectively instantiate the bridge's `FfiProviderFactory` (no-arg) and take its + * 1. Reflectively instantiate the bridge's `BridgeProviderFactory` (no-arg) and take its * [[ScanBackend]]. * 2. 
`backend.createScan(options, partitionBytes, ...)` — builds the provider for the slice * described by `partitionBytes` and does the rest natively: widening wrap, private @@ -90,8 +90,8 @@ class DatafusionColumnarPartitionReader( if (first != null) throw first } - private def instantiateFactory(fqcn: String): FfiProviderFactory = { + private def instantiateFactory(fqcn: String): BridgeProviderFactory = { val cls = Class.forName(fqcn) - cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + cls.getDeclaredConstructor().newInstance().asInstanceOf[BridgeProviderFactory] } } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index 2b221e5..decd5c8 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -31,14 +31,14 @@ sealed trait DatafusionPartition extends InputPartition /** * Per-task payload for the per-partition payload (legacy) read path. * - * - `factoryFqcn`: fully-qualified class name of the bridge's `FfiProviderFactory`. The + * - `factoryFqcn`: fully-qualified class name of the bridge's `BridgeProviderFactory`. The * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes, * partitionBytes)`. * - `optionsProtoBytes`: bridge-specific global connection options, encoded by the bridge. * Opaque to connector-core. Same bytes ride along on every partition. * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is - * applied natively via `FfiHelperNative.createScan`. + * applied natively via `ScanBackend.createScan`. * - `partitionId`: stable identifier (e.g. a segment or file id) — surfaces in Spark UI/logs/errors. 
* - `partitionBytes`: opaque per-partition payload from `PartitionInfo.partitionBytes`. Passed * back into `createProvider` so the bridge materialises *this* slice. diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index 80f35bd..4c8b47f 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -46,7 +46,7 @@ final case class LegacyMode( /** * Shared-scan mode: one cached provider + plan per (executor × scan), `numPartitions` tasks each - * driving one DataFusion output partition. See [[FfiProviderFactory#sharedScan]] for the + * driving one DataFusion output partition. See [[BridgeProviderFactory#sharedScan]] for the * determinism contract. */ final case class SharedScanMode( @@ -59,7 +59,7 @@ final case class SharedScanMode( /** * Read plan for a DataFusion-backed scan. Holds pruning state, the pushed predicates (for * `description()` / `explain(True)`), the corresponding `LogicalExprNode` proto byte arrays the - * executor applies natively via `FfiHelperNative.createScan`, and the driver-resolved + * executor applies natively via `ScanBackend.createScan`, and the driver-resolved * [[DatafusionScanMode]]. 
* * Legacy mode with a bridge-declared [[ReportedPartitioning]] surfaces `KeyGroupedPartitioning` diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala index 45eaa87..8272f6d 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -99,12 +99,12 @@ class DatafusionScanBuilder( ) } - private def buildLegacyMode(factory: FfiProviderFactory): LegacyMode = { + private def buildLegacyMode(factory: BridgeProviderFactory): LegacyMode = { val partitions: Array[PartitionInfo] = factory.listPartitions(optionsProtoBytes, pushedBytes) if (partitions == null || partitions.isEmpty) { throw new IllegalStateException( - s"FfiProviderFactory '$factoryFqcn' returned no partitions to scan" + s"BridgeProviderFactory '$factoryFqcn' returned no partitions to scan" ) } LegacyMode(partitions, factory.reportPartitioning(optionsProtoBytes)) @@ -144,8 +144,8 @@ class DatafusionScanBuilder( SharedScanMode(scanId, numPartitions, pinned, idleTtlMs) } - private def instantiateFactory(fqcn: String): FfiProviderFactory = { + private def instantiateFactory(fqcn: String): BridgeProviderFactory = { val cls = Class.forName(fqcn) - cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + cls.getDeclaredConstructor().newInstance().asInstanceOf[BridgeProviderFactory] } } diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index 320cc76..09fd652 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -46,7 +46,7 @@ class DatafusionSource extends TableProvider with DataSourceRegister { override def shortName(): String = "datafusion" - /** Spark option key carrying the FfiProviderFactory FQCN when no 
override is provided. */ + /** Spark option key carrying the BridgeProviderFactory FQCN when no override is provided. */ protected val FactoryOptionKey: String = "df.factory" /** @@ -88,8 +88,8 @@ class DatafusionSource extends TableProvider with DataSourceRegister { override def supportsExternalMetadata(): Boolean = false - private def instantiateFactory(fqcn: String): FfiProviderFactory = { + private def instantiateFactory(fqcn: String): BridgeProviderFactory = { val cls = Class.forName(fqcn) - cls.getDeclaredConstructor().newInstance().asInstanceOf[FfiProviderFactory] + cls.getDeclaredConstructor().newInstance().asInstanceOf[BridgeProviderFactory] } } diff --git a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala index dbf7e02..77350a7 100644 --- a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala +++ b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala @@ -72,7 +72,7 @@ private[spark] object NativeSharedScanResources extends Logging { .forName(spec.factoryFqcn) .getDeclaredConstructor() .newInstance() - .asInstanceOf[FfiProviderFactory] + .asInstanceOf[BridgeProviderFactory] val backend = factory.scanBackend() val allocator = new RootAllocator(Long.MaxValue) diff --git a/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala index 7fc21ea..1340978 100644 --- a/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala +++ b/spark/src/main/scala/io/datafusion/spark/PinnedSessionConfig.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.internal.SQLConf * executor — and partition-indexed execution would silently drop or duplicate data. 
The driver * resolves these values once in `DatafusionScanBuilder.build()`, ships them inside every * [[DatafusionSharedScanPartition]], and both the driver probe and the executors hand the same - * values to `FfiHelperNative.createScan`, which builds the native `SessionContext` from them. + * values to `ScanBackend.createScan`, which builds the native `SessionContext` from them. * * `options` additionally disables the optimizer's plan-reshaping repartition passes so the * physical partitioning is exactly what the provider's `scan()` reports, on every machine. diff --git a/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala index 01f62a2..f7bad10 100644 --- a/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala @@ -48,7 +48,7 @@ class SharedScanPartitionReader( s"shared-scan determinism violation for scanId=${partition.scanId}: driver planned " + s"${partition.numPartitions} partition(s) but this executor planned $executorCount. 
" + "The provider's partitioning must be a pure function of optionsProtoBytes; pin your " + - "source snapshot (see FfiProviderFactory.sharedScan).") + "source snapshot (see BridgeProviderFactory.sharedScan).") } private val taskAllocator: BufferAllocator = { diff --git a/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala b/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala similarity index 68% rename from spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala rename to spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala index ff6c8ef..79f00cb 100644 --- a/spark/src/test/scala/io/datafusion/spark/FfiProviderFactoryDefaultsTest.scala +++ b/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala @@ -21,20 +21,45 @@ package io.datafusion.spark import org.scalatest.funsuite.AnyFunSuite -class FfiProviderFactoryDefaultsTest extends AnyFunSuite { +class BridgeProviderFactoryDefaultsTest extends AnyFunSuite { + + /** Backend stub: the defaults under test never touch native code. 
*/ + private object StubBackend extends ScanBackend { + def providerSchemaIpc(options: Array[Byte], partitionBytes: Array[Byte]): Array[Byte] = + throw new UnsupportedOperationException + def createScan( + options: Array[Byte], + partitionBytes: Array[Byte], + targetPartitions: Int, + batchSize: Int, + optionKeys: Array[String], + optionValues: Array[String], + projectionColumns: Array[String], + filterProtos: Array[Array[Byte]]): Long = throw new UnsupportedOperationException + def partitionCount(scanHandle: Long): Int = throw new UnsupportedOperationException + def executeStreamPartition(scanHandle: Long, partition: Int, ffiStreamAddr: Long): Unit = + throw new UnsupportedOperationException + def executeStream(scanHandle: Long, ffiStreamAddr: Long): Unit = + throw new UnsupportedOperationException + def closeScan(scanHandle: Long): Unit = throw new UnsupportedOperationException + } /** Factory overriding only listPartitions (to spy on its inputs). */ - private class MinimalFactory extends FfiProviderFactory { + private class MinimalFactory extends BridgeProviderFactory { var lastListPartitionsOpts: Array[Byte] = _ + override def scanBackend(): ScanBackend = StubBackend + override def listPartitions(optionsProtoBytes: Array[Byte]): Array[PartitionInfo] = { lastListPartitionsOpts = optionsProtoBytes Array(new PartitionInfo("p0", Array.emptyByteArray, Array.empty[String])) } } - /** Every method left at its default — the literal minimum a bridge can ship. */ - private class EmptyFactory extends FfiProviderFactory + /** Only the required method implemented — the literal minimum a bridge can ship. 
*/ + private class EmptyFactory extends BridgeProviderFactory { + override def scanBackend(): ScanBackend = StubBackend + } test("sharedScan defaults to false") { assert(!new MinimalFactory().sharedScan(Array[Byte](1, 2, 3))) @@ -56,17 +81,6 @@ class FfiProviderFactoryDefaultsTest extends AnyFunSuite { assert(partitions(0).preferredLocations().isEmpty) } - test("default createProvider rejects with guidance toward scanBackend") { - val e = intercept[UnsupportedOperationException] { - new EmptyFactory().createProvider(Array.emptyByteArray, Array.emptyByteArray) - } - assert(e.getMessage.contains("scanBackend")) - } - - test("default scanBackend is the FFI path") { - assert(new EmptyFactory().scanBackend().isInstanceOf[FfiScanBackend]) - } - test("filter-aware listPartitions delegates to the filter-unaware overload") { val factory = new MinimalFactory val opts = Array[Byte](7, 8) From 13bca92dc1aa8398b892f148f4a2953fd4a89c66 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 18:01:20 +0200 Subject: [PATCH 18/22] docs: scrub dual-path language after FFI removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doc comments still described two binding styles, a connector cdylib, and a separate widening library — none of which exist. Module docs, javadoc, error messages, and READMEs now describe the single path: every bridge cdylib is an export_bridge! expansion over the datafusion-spark-bridge SDK, widening included. 
Co-Authored-By: Claude Fable 5 --- examples/python/README.md | 12 ++++++------ native-common/src/lib.rs | 9 +++++---- native/src/lib.rs | 4 ++-- spark/README.md | 3 ++- spark/bridge/src/lib.rs | 2 +- spark/bridge/src/scan.rs | 12 ++++++------ .../io/datafusion/spark/ArrowToSparkSchema.scala | 11 ++++++----- 7 files changed, 28 insertions(+), 25 deletions(-) diff --git a/examples/python/README.md b/examples/python/README.md index b272d64..c9a335d 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -110,16 +110,16 @@ root +---+-----+ ``` -Filter row count drops from 4 → 2 because the predicate is pushed across the -FFI boundary as a `LogicalExprNode` proto and applied inside DataFusion before -Arrow batches cross back to Spark. +Filter row count drops from 4 → 2 because the predicate is pushed into the +bridge cdylib as a `LogicalExprNode` proto and applied inside DataFusion +before Arrow batches cross back to Spark. ## Notes - `master("local[2]")` keeps driver + executor in one JVM so the example - cdylib loads once. Cluster mode would need the cdylib pre-staged on every - worker (the widening lib is bundled in `datafusion-java-spark`; only the - per-bridge example lib is not). + cdylib loads once. In cluster mode nothing extra is needed: the bridge + cdylib travels inside the examples jar and `NativeLibraryLoader` extracts + it on every worker. - `extraClassPath` (not `--packages` / `userClassPathFirst`) is used because the Spark distro ships Arrow 12, flatbuffers 1.12, and protobuf 2.5, all of which we need to override; userClassPathFirst splits Netty across two diff --git a/native-common/src/lib.rs b/native-common/src/lib.rs index 01227fd..f143d43 100644 --- a/native-common/src/lib.rs +++ b/native-common/src/lib.rs @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. -//! JNI plumbing shared by this workspace's cdylibs (`datafusion-jni` and the -//! 
Spark connector helper): the error-to-Java-exception mapping, the -//! per-cdylib Tokio runtime singleton, and the async-stream-to- -//! `FFI_ArrowArrayStream` bridge. +//! JNI plumbing shared by this workspace's native crates (`datafusion-jni` +//! and `datafusion-spark-bridge`, and through the latter every bridge +//! cdylib): the error-to-Java-exception mapping, the per-cdylib Tokio +//! runtime singleton, and the async-stream-to-`FFI_ArrowArrayStream` +//! bridge. //! //! Each cdylib statically links its own copy of this rlib, so [`runtime`] is //! a per-cdylib singleton -- exactly the behaviour each crate had when this diff --git a/native/src/lib.rs b/native/src/lib.rs index 6fb1eb1..6e1a79f 100644 --- a/native/src/lib.rs +++ b/native/src/lib.rs @@ -82,8 +82,8 @@ pub(crate) fn jvm() -> &'static JavaVM { pub(crate) fn runtime() -> &'static Runtime { // The singleton itself lives in datafusion-jni-common (shared with the - // Spark helper cdylib; each cdylib statically links its own copy, so the - // runtime stays per-library). The init hook eagerly installs the + // datafusion-spark-bridge SDK; each cdylib statically links its own + // copy, so the runtime stays per-library). The init hook eagerly installs the // runtime-metrics accumulator (no-op when the `runtime-metrics` Cargo // feature is off). Initialising here -- not lazily on the first // `runtimeStats()` call -- means the RuntimeMonitor's sampling baseline diff --git a/spark/README.md b/spark/README.md index bbbf48c..17147d6 100644 --- a/spark/README.md +++ b/spark/README.md @@ -380,7 +380,8 @@ Shared-scan operational details: - **Schema inference** — your provider's Arrow schema, widened, becomes the Spark schema. Driver-side, one probe build with empty `partitionBytes`. - **Type widening** — Spark's columnar readers reject several Arrow types - DataFusion happily produces. The connector cdylib transparently casts + DataFusion happily produces. 
The SDK (inside your bridge's cdylib) + transparently casts unsigned ints → wider signed, `Float16` → `Float32`, `Time*` → wider ints, any-unit/tz `Timestamp` → microsecond, recursively through `List`/`LargeList`/`FixedSizeList` (see diff --git a/spark/bridge/src/lib.rs b/spark/bridge/src/lib.rs index 90e5e24..52ef1c1 100644 --- a/spark/bridge/src/lib.rs +++ b/spark/bridge/src/lib.rs @@ -75,7 +75,7 @@ pub(crate) fn runtime_handle() -> &'static Handle { datafusion_jni_common::runtime().handle() } -/// Generate the JNI entry points for a static bridge cdylib. +/// Generate the JNI entry points for a bridge cdylib. /// /// `jni_class` is the **underscore-mangled** binary name of the Java class /// declaring the matching `native` methods: dots become underscores diff --git a/spark/bridge/src/scan.rs b/spark/bridge/src/scan.rs index 43f580d..8d67b14 100644 --- a/spark/bridge/src/scan.rs +++ b/spark/bridge/src/scan.rs @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. -//! Planning and execution of a Spark scan, provider-source-agnostic. +//! Planning and execution of a Spark scan. //! -//! Every function here is the body of one JNI entry point; the caller (the -//! generic FFI cdylib, or a static bridge's `export_bridge!` expansion) -//! supplies only how the provider is obtained, as a `make` closure. The -//! provider is wrapped in a [`WideningTableProvider`] here, so both binding -//! styles get identical Spark-compatible Arrow types. +//! Every function here is the body of one JNI entry point generated by a +//! bridge's `export_bridge!` expansion, which supplies only how the provider +//! is obtained, as a `make` closure. The provider is wrapped in a +//! [`WideningTableProvider`] here, so every bridge gets identical +//! Spark-compatible Arrow types. //! //! [`create_scan`] registers the widened provider on a private //! 
`SessionContext` built from the caller-pinned config, applies the pruned diff --git a/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala b/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala index eac18b6..2e8f1a5 100644 --- a/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala +++ b/spark/src/main/scala/io/datafusion/spark/ArrowToSparkSchema.scala @@ -35,9 +35,10 @@ import org.apache.spark.sql.types._ * Null. No unsigned-int or Time accessor exists; we surface a clear error at schema discovery * for those — the alternative is silent corruption. * - * The widening cdylib (connector-core/native/) inserts a `WideningTableProvider` upstream of the - * Spark reader that casts unsupported types kernel-side (UInt*→signed wider, Float16→Float32, - * non-µs Timestamp→µs Timestamp, Time→Int) so Spark only ever sees compatible Arrow types. + * The widening layer (datafusion-spark-bridge, compiled into every bridge cdylib) inserts a + * `WideningTableProvider` upstream of the Spark reader that casts unsupported types kernel-side + * (UInt*→signed wider, Float16→Float32, non-µs Timestamp→µs Timestamp, Time→Int) so Spark only + * ever sees compatible Arrow types. 
*/ object ArrowToSparkSchema { @@ -67,7 +68,7 @@ object ArrowToSparkSchema { unsupported( f, s"unsigned integer UInt$bits (Spark ArrowColumnVector has no unsigned accessor; " + - "widening cdylib casts these before Spark sees them — this branch indicates the " + + "widening layer casts these before Spark sees them — this branch indicates the " + "WideningTableProvider was bypassed)" ) case (bits, signed) => unsupported(f, s"Int(bits=$bits, signed=$signed)") @@ -76,7 +77,7 @@ object ArrowToSparkSchema { case t: ArrowType.FloatingPoint => t.getPrecision match { case FloatingPointPrecision.HALF => - unsupported(f, "Float16 (widening cdylib must cast to Float32 before Spark)") + unsupported(f, "Float16 (widening layer must cast to Float32 before Spark)") case FloatingPointPrecision.SINGLE => FloatType case FloatingPointPrecision.DOUBLE => DoubleType case other => unsupported(f, s"FloatingPoint($other)") From a1815a8d97fdc026d38eaeabb89b9285793a51bd Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 19:45:00 +0200 Subject: [PATCH 19/22] refactor(spark): rename optionsProtoBytes, fix stale createProvider docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default options encoding is OptionsCodec key/value strings, not protobuf, so optionsProtoBytes was misleading. Rename to optionsBytes across main, test, and examples. filterProtoBytes is left as-is — those genuinely are LogicalExprNode proto bytes. Also fix five doc references to the removed createProvider method (now ScanBackend.createScan): two broken {@link}s in PartitionInfo.java that would fail -Xdoclint, two comments in DatafusionInputPartition.scala, and the bridge_demo.py note (which also claimed a non-existent stdout line — reworded to the native build_provider). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- examples/python/bridge_demo.py | 4 ++-- .../examples/ExampleBridgeProviderFactory.java | 18 +++++++++--------- spark/README.md | 8 ++++---- .../spark/BridgeProviderFactory.java | 12 ++++++------ .../io/datafusion/spark/PartitionInfo.java | 6 +++--- .../io/datafusion/spark/DatafusionBatch.scala | 4 ++-- .../DatafusionColumnarPartitionReader.scala | 2 +- .../spark/DatafusionInputPartition.scala | 14 +++++++------- .../io/datafusion/spark/DatafusionScan.scala | 2 +- .../spark/DatafusionScanBuilder.scala | 12 ++++++------ .../io/datafusion/spark/DatafusionSource.scala | 2 +- .../io/datafusion/spark/DatafusionTable.scala | 4 ++-- .../spark/NativeSharedScanResources.scala | 2 +- .../io/datafusion/spark/SharedScanCache.scala | 2 +- .../spark/SharedScanPartitionReader.scala | 2 +- .../BridgeProviderFactoryDefaultsTest.scala | 4 ++-- .../datafusion/spark/SharedScanCacheTest.scala | 2 +- 17 files changed, 50 insertions(+), 50 deletions(-) diff --git a/examples/python/bridge_demo.py b/examples/python/bridge_demo.py index bf3f6e6..a630224 100644 --- a/examples/python/bridge_demo.py +++ b/examples/python/bridge_demo.py @@ -222,8 +222,8 @@ def main() -> None: # Note on cache scope: the executor cache is keyed by a per-query scanId, # so sharing happens across the TASKS of one query (4 tasks above -> one - # provider build per executor JVM, observable via the factory's - # createProvider stdout line), not across separate actions. Each new + # provider build per executor JVM, in the bridge's native build_provider), + # not across separate actions. Each new # action plans a new scan with a fresh scanId; its entry simply joins the # cache until the idle TTL evicts it. 
count_again = shared.count() diff --git a/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java index 27391fa..5b4c921 100644 --- a/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java +++ b/examples/src/main/java/org/apache/datafusion/examples/ExampleBridgeProviderFactory.java @@ -60,7 +60,7 @@ *

        * *

        Real bridges (HDF5, custom Iceberg, in-house formats) use a protobuf schema for {@code - * optionsProtoBytes}; this example uses a hand-rolled length-prefixed binary format to keep the + * optionsBytes}; this example uses a hand-rolled length-prefixed binary format to keep the * wire layer obvious: * *

        @@ -112,31 +112,31 @@ public byte[] encodeOptions(Map<String, String> sparkOptions) {
           }
         
           @Override
        -  public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) {
        +  public PartitionInfo[] listPartitions(byte[] optionsBytes) {
             // Single partition; the example MemTable is not actually sliced. A real bridge would
             // populate `partitionBytes` per slice and `preferredLocations` with the hosts holding it.
             return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])};
           }
         
           @Override
        -  public PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filterProtoBytes) {
        +  public PartitionInfo[] listPartitions(byte[] optionsBytes, byte[][] filterProtoBytes) {
             // The example cannot prune its single partition, but a real bridge would inspect the
             // pushed predicates here and drop partitions that cannot match.
             System.out.println(
                 "ExampleBridgeProviderFactory.listPartitions received "
                     + filterProtoBytes.length
                     + " pushed filter(s)");
        -    return listPartitions(optionsProtoBytes);
        +    return listPartitions(optionsBytes);
           }
         
           @Override
        -  public boolean sharedScan(byte[] optionsProtoBytes) {
        +  public boolean sharedScan(byte[] optionsBytes) {
             // The flag is the final byte of the options blob (present only when the encoder wrote the
             // trailing fields). The bridge owns its wire format, so decoding it here is fair game.
        -    return optionsProtoBytes != null
        -        && optionsProtoBytes.length >= 1
        -        && hasTrailingFields(optionsProtoBytes)
        -        && optionsProtoBytes[optionsProtoBytes.length - 1] == 1;
        +    return optionsBytes != null
        +        && optionsBytes.length >= 1
        +        && hasTrailingFields(optionsBytes)
        +        && optionsBytes[optionsBytes.length - 1] == 1;
           }
         
           private static boolean hasTrailingFields(byte[] bytes) {
        diff --git a/spark/README.md b/spark/README.md
        index 17147d6..941c427 100644
        --- a/spark/README.md
        +++ b/spark/README.md
        @@ -149,8 +149,8 @@ into more than one Spark task:
         
         ```java
             @Override
        -    public PartitionInfo[] listPartitions(byte[] optionsProtoBytes) {
        -        MySlice[] slices = MyBridgeNative.listSlices(optionsProtoBytes);
        +    public PartitionInfo[] listPartitions(byte[] optionsBytes) {
        +        MySlice[] slices = MyBridgeNative.listSlices(optionsBytes);
                 PartitionInfo[] out = new PartitionInfo[slices.length];
                 for (int i = 0; i < slices.length; i++) {
                     out[i] = new PartitionInfo(slices[i].id(), slices[i].payload(), slices[i].hosts());
        @@ -333,7 +333,7 @@ provider builds dominate. Opting in via
         
         ```java
         @Override
        -public boolean sharedScan(byte[] optionsProtoBytes) { return true; }
        +public boolean sharedScan(byte[] optionsBytes) { return true; }
         ```
         
         flips the mapping: the provider is built **once per executor JVM per query**
        @@ -356,7 +356,7 @@ Choosing between the modes:
         
         Shared-scan's price of admission is a **determinism contract**: the
         provider's schema, partitioning, and per-partition contents must be a pure
        -function of `optionsProtoBytes`. Remote sources must pin a snapshot
        +function of `optionsBytes`. Remote sources must pin a snapshot
         (version/timestamp) inside the options. The connector fails tasks when an
         executor's partition count diverges from the driver's, but equal counts with
         different contents are undetectable by construction. The provider's
        diff --git a/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java b/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java
        index 1231d35..3bcf7ad 100644
        --- a/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java
        +++ b/spark/src/main/java/io/datafusion/spark/BridgeProviderFactory.java
        @@ -79,7 +79,7 @@ default byte[] encodeOptions(Map<String, String> sparkOptions) {
            * #sharedScan(byte[])}) before pointing it at anything large. Size guidance lives in {@code
            * spark/README.md}.
            */
        -  default PartitionInfo[] listPartitions(byte[] optionsProtoBytes) {
        +  default PartitionInfo[] listPartitions(byte[] optionsBytes) {
             return new PartitionInfo[] {new PartitionInfo("p0", new byte[0], new String[0])};
           }
         
        @@ -95,8 +95,8 @@ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes) {
            * conjunction of all pushed predicates. The default delegates to the filter-unaware overload (no
            * pruning), which is always correct.
            */
        -  default PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filterProtoBytes) {
        -    return listPartitions(optionsProtoBytes);
        +  default PartitionInfo[] listPartitions(byte[] optionsBytes, byte[][] filterProtoBytes) {
        +    return listPartitions(optionsBytes);
           }
         
           /**
        @@ -118,7 +118,7 @@ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filter
            *
            * 
          *
        • The provider's schema, partitioning, and per-partition row content are a pure function of - * {@code optionsProtoBytes}. Remote sources must pin a snapshot (version, timestamp) inside + * {@code optionsBytes}. Remote sources must pin a snapshot (version, timestamp) inside * the options; data that compacts or moves between driver planning and executor execution * otherwise yields wrong results that no runtime check can catch. *
        • The provider's {@code ExecutionPlan} supports calling {@code execute(i)} more than once @@ -129,7 +129,7 @@ default PartitionInfo[] listPartitions(byte[] optionsProtoBytes, byte[][] filter *

          The connector fails tasks with a clear error when the executor's partition count diverges * from the driver's — but identical counts with different contents cannot be detected. */ - default boolean sharedScan(byte[] optionsProtoBytes) { + default boolean sharedScan(byte[] optionsBytes) { return false; } @@ -154,7 +154,7 @@ default boolean sharedScan(byte[] optionsProtoBytes) { * KeyGroupedPartitioning} entirely. Storage-partitioned joins additionally require {@code * spark.sql.sources.v2.bucketing.enabled=true}. */ - default ReportedPartitioning reportPartitioning(byte[] optionsProtoBytes) { + default ReportedPartitioning reportPartitioning(byte[] optionsBytes) { return null; } } diff --git a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java index 67376db..e6e061b 100644 --- a/spark/src/main/java/io/datafusion/spark/PartitionInfo.java +++ b/spark/src/main/java/io/datafusion/spark/PartitionInfo.java @@ -22,7 +22,7 @@ /** * Driver-side descriptor for a single partition produced by {@link * BridgeProviderFactory#listPartitions(byte[])}. Carries the bridge-specific slice payload that the - * executor passes back into {@link BridgeProviderFactory#createProvider(byte[], byte[])}, plus + * executor passes back into {@link ScanBackend#createScan}, plus * optional host hints for Spark's scheduler. * *

          Fields: @@ -32,8 +32,8 @@ * Surfaces in Spark UI, logs, and exception messages. Must be non-empty. *

        • {@code partitionBytes} — opaque per-partition payload. Bridge encodes whatever the executor * needs to materialise *this* slice (offsets, row ranges, sub-options, etc.). Combined with - * the global {@code optionsProtoBytes} in {@link BridgeProviderFactory#createProvider(byte[], - * byte[])}. Empty array = no per-partition state (single-partition table). + * the global {@code optionsBytes} in {@link ScanBackend#createScan}. Empty array = no + * per-partition state (single-partition table). *
        • {@code preferredLocations} — hostnames where this partition's data lives. Returned from * {@code InputPartition.preferredLocations()} so Spark can co-locate the task with the data. * Empty array = no preference. Honoured subject to {@code spark.locality.wait}. diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index a97ecbe..90e829a 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -43,7 +43,7 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { partitions.iterator.map { p => val base = DatafusionInputPartition( factoryFqcn = scan.factoryFqcn, - optionsProtoBytes = scan.optionsProtoBytes, + optionsBytes = scan.optionsBytes, projectionColumnNames = projection, filterProtoBytes = filterBytes, partitionId = p.id, @@ -63,7 +63,7 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { Array.tabulate[InputPartition](numPartitions) { i => DatafusionSharedScanPartition( factoryFqcn = scan.factoryFqcn, - optionsProtoBytes = scan.optionsProtoBytes, + optionsBytes = scan.optionsBytes, projectionColumnNames = projection, filterProtoBytes = filterBytes, scanId = scanId, diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index 678d258..e8b9d69 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -49,7 +49,7 @@ class DatafusionColumnarPartitionReader( private val scanHandle: Long = try { backend.createScan( - partition.optionsProtoBytes, + partition.optionsBytes, partition.partitionBytes, /* targetPartitions = */ -1, /* batchSize = */ -1, diff --git 
a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index decd5c8..758b819 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -32,22 +32,22 @@ sealed trait DatafusionPartition extends InputPartition * Per-task payload for the per-partition payload (legacy) read path. * * - `factoryFqcn`: fully-qualified class name of the bridge's `BridgeProviderFactory`. The - * executor reflectively instantiates this and calls `createProvider(optionsProtoBytes, - * partitionBytes)`. - * - `optionsProtoBytes`: bridge-specific global connection options, encoded by the bridge. + * executor reflectively instantiates this and calls + * `scanBackend().createScan(optionsBytes, partitionBytes, …)`. + * - `optionsBytes`: bridge-specific global connection options, encoded by the bridge. * Opaque to connector-core. Same bytes ride along on every partition. * - `projectionColumnNames`: pruned column list (post-`pruneColumns`). * - `filterProtoBytes`: V2 `Predicate` → DataFusion `LogicalExprNode` proto bytes; each one is * applied natively via `ScanBackend.createScan`. * - `partitionId`: stable identifier (e.g. a segment or file id) — surfaces in Spark UI/logs/errors. * - `partitionBytes`: opaque per-partition payload from `PartitionInfo.partitionBytes`. Passed - * back into `createProvider` so the bridge materialises *this* slice. + * back into `ScanBackend.createScan` so the bridge materialises *this* slice. * - `preferredLocs`: hostnames where this partition's data lives; returned from * `preferredLocations()` so Spark schedules the task there subject to `spark.locality.wait`. 
*/ final case class DatafusionInputPartition( factoryFqcn: String, - optionsProtoBytes: Array[Byte], + optionsBytes: Array[Byte], projectionColumnNames: Array[String], filterProtoBytes: Array[Array[Byte]], partitionId: String, @@ -93,7 +93,7 @@ final case class DatafusionKeyedInputPartition( */ final case class DatafusionSharedScanPartition( factoryFqcn: String, - optionsProtoBytes: Array[Byte], + optionsBytes: Array[Byte], projectionColumnNames: Array[String], filterProtoBytes: Array[Array[Byte]], scanId: String, @@ -107,7 +107,7 @@ final case class DatafusionSharedScanPartition( SharedScanSpec( scanId = scanId, factoryFqcn = factoryFqcn, - optionsProtoBytes = optionsProtoBytes, + optionsBytes = optionsBytes, projectionColumnNames = projectionColumnNames, filterProtoBytes = filterProtoBytes, pinnedConfig = pinnedConfig diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index 4c8b47f..1bcda2d 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -69,7 +69,7 @@ final case class SharedScanMode( */ class DatafusionScan( val factoryFqcn: String, - val optionsProtoBytes: Array[Byte], + val optionsBytes: Array[Byte], val fullSchema: StructType, val prunedSchema: StructType, val pushedPredicates: Array[Predicate], diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala index 8272f6d..316fbb5 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -45,7 +45,7 @@ import org.apache.spark.sql.types.StructType */ class DatafusionScanBuilder( factoryFqcn: String, - optionsProtoBytes: Array[Byte], + optionsBytes: Array[Byte], fullSchema: StructType ) extends ScanBuilder with SupportsPushDownV2Filters @@ 
-86,11 +86,11 @@ class DatafusionScanBuilder( override def build(): Scan = { val factory = instantiateFactory(factoryFqcn) val mode: DatafusionScanMode = - if (factory.sharedScan(optionsProtoBytes)) buildSharedScanMode() + if (factory.sharedScan(optionsBytes)) buildSharedScanMode() else buildLegacyMode(factory) new DatafusionScan( factoryFqcn, - optionsProtoBytes, + optionsBytes, fullSchema, pruned, pushed, @@ -101,13 +101,13 @@ class DatafusionScanBuilder( private def buildLegacyMode(factory: BridgeProviderFactory): LegacyMode = { val partitions: Array[PartitionInfo] = - factory.listPartitions(optionsProtoBytes, pushedBytes) + factory.listPartitions(optionsBytes, pushedBytes) if (partitions == null || partitions.isEmpty) { throw new IllegalStateException( s"BridgeProviderFactory '$factoryFqcn' returned no partitions to scan" ) } - LegacyMode(partitions, factory.reportPartitioning(optionsProtoBytes)) + LegacyMode(partitions, factory.reportPartitioning(optionsBytes)) } /** @@ -125,7 +125,7 @@ class DatafusionScanBuilder( val probeSpec = SharedScanSpec( scanId = scanId, factoryFqcn = factoryFqcn, - optionsProtoBytes = optionsProtoBytes, + optionsBytes = optionsBytes, projectionColumnNames = pruned.fieldNames, filterProtoBytes = pushedBytes, pinnedConfig = pinned diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala index 09fd652..125b3a1 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionSource.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap * * Schema discovery happens driver-side inside the bridge's native scan backend * (`ScanBackend.providerSchemaIpc`), which widens the provider and returns its Arrow schema as - * IPC bytes. The same `optionsProtoBytes` (and the factory FQCN) is then carried verbatim through + * IPC bytes. 
The same `optionsBytes` (and the factory FQCN) is then carried verbatim through * `DatafusionInputPartition`, so each executor task repeats the same factory → backend pipeline * locally. */ diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala index 31a55d2..a0e8ec4 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionTable.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap */ class DatafusionTable( val factoryFqcn: String, - val optionsProtoBytes: Array[Byte], + val optionsBytes: Array[Byte], val sparkSchema: StructType ) extends Table with SupportsRead { @@ -47,5 +47,5 @@ class DatafusionTable( } override def newScanBuilder(scanOpts: CaseInsensitiveStringMap): ScanBuilder = - new DatafusionScanBuilder(factoryFqcn, optionsProtoBytes, sparkSchema) + new DatafusionScanBuilder(factoryFqcn, optionsBytes, sparkSchema) } diff --git a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala index 77350a7..b541c8a 100644 --- a/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala +++ b/spark/src/main/scala/io/datafusion/spark/NativeSharedScanResources.scala @@ -80,7 +80,7 @@ private[spark] object NativeSharedScanResources extends Logging { // Shared mode builds the dataset-wide provider: empty partitionBytes, like the // driver-side schema probe. DataFusion-native partitioning replaces listPartitions. 
val scanHandle = backend.createScan( - spec.optionsProtoBytes, + spec.optionsBytes, Array.emptyByteArray, spec.pinnedConfig.targetPartitions, spec.pinnedConfig.batchSize, diff --git a/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala b/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala index a092eb7..a134746 100644 --- a/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala +++ b/spark/src/main/scala/io/datafusion/spark/SharedScanCache.scala @@ -31,7 +31,7 @@ import org.apache.arrow.vector.ipc.ArrowReader final case class SharedScanSpec( scanId: String, factoryFqcn: String, - optionsProtoBytes: Array[Byte], + optionsBytes: Array[Byte], projectionColumnNames: Array[String], filterProtoBytes: Array[Array[Byte]], pinnedConfig: PinnedSessionConfig diff --git a/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala index f7bad10..4f0c9c1 100644 --- a/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/SharedScanPartitionReader.scala @@ -47,7 +47,7 @@ class SharedScanPartitionReader( throw new IllegalStateException( s"shared-scan determinism violation for scanId=${partition.scanId}: driver planned " + s"${partition.numPartitions} partition(s) but this executor planned $executorCount. 
" + - "The provider's partitioning must be a pure function of optionsProtoBytes; pin your " + + "The provider's partitioning must be a pure function of optionsBytes; pin your " + "source snapshot (see BridgeProviderFactory.sharedScan).") } diff --git a/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala b/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala index 79f00cb..0b94eee 100644 --- a/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala +++ b/spark/src/test/scala/io/datafusion/spark/BridgeProviderFactoryDefaultsTest.scala @@ -50,8 +50,8 @@ class BridgeProviderFactoryDefaultsTest extends AnyFunSuite { override def scanBackend(): ScanBackend = StubBackend - override def listPartitions(optionsProtoBytes: Array[Byte]): Array[PartitionInfo] = { - lastListPartitionsOpts = optionsProtoBytes + override def listPartitions(optionsBytes: Array[Byte]): Array[PartitionInfo] = { + lastListPartitionsOpts = optionsBytes Array(new PartitionInfo("p0", Array.emptyByteArray, Array.empty[String])) } } diff --git a/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala b/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala index 08acc97..dae49eb 100644 --- a/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala +++ b/spark/src/test/scala/io/datafusion/spark/SharedScanCacheTest.scala @@ -32,7 +32,7 @@ class SharedScanCacheTest extends AnyFunSuite { SharedScanSpec( scanId = scanId, factoryFqcn = "test.Factory", - optionsProtoBytes = Array.emptyByteArray, + optionsBytes = Array.emptyByteArray, projectionColumnNames = Array.empty, filterProtoBytes = Array.empty, pinnedConfig = PinnedSessionConfig(8, 8192, Vector.empty) From 4b997345099a39025acf07269455109628370664 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 11 Jun 2026 20:54:20 +0200 Subject: [PATCH 20/22] refactor(spark): rename LegacyMode to PerPartitionMode The "legacy" naming implied a deprecated 
path, but this is all new, unreleased code with no prior path. Rename the scan mode and scrub "legacy" wording from comments to describe what the mode actually does. Co-Authored-By: Claude Opus 4.8 (1M context) --- spark/bridge/src/scan.rs | 7 +++---- spark/src/main/java/io/datafusion/spark/ScanBackend.java | 2 +- .../main/scala/io/datafusion/spark/DatafusionBatch.scala | 4 ++-- .../spark/DatafusionColumnarPartitionReader.scala | 2 +- .../io/datafusion/spark/DatafusionInputPartition.scala | 4 ++-- .../main/scala/io/datafusion/spark/DatafusionScan.scala | 8 ++++---- .../scala/io/datafusion/spark/DatafusionScanBuilder.scala | 6 +++--- 7 files changed, 16 insertions(+), 17 deletions(-) diff --git a/spark/bridge/src/scan.rs b/spark/bridge/src/scan.rs index 8d67b14..ad27dff 100644 --- a/spark/bridge/src/scan.rs +++ b/spark/bridge/src/scan.rs @@ -38,9 +38,8 @@ //! succeeds when every operator in that partition's pipeline supports //! repeated `execute()` — stateless scans do, `RepartitionExec` //! pipelines do not; -//! - [`execute_stream`] — the whole plan as one stream (legacy -//! per-partition payload mode, where the provider itself is the task's -//! slice); +//! - [`execute_stream`] — the whole plan as one stream (per-partition +//! mode, where the provider itself is the task's slice); //! - [`close_scan`] — drop the plan. The single unsafe interleaving is //! closing a handle that still has an in-flight call; the Java consumer //! (the shared-scan cache) prevents it with a refcount covering every @@ -282,7 +281,7 @@ pub fn execute_stream_partition( }) } -/// Whole-plan stream for legacy per-partition payload mode (the provider +/// Whole-plan stream for per-partition mode (the provider /// itself is the task's slice, so all plan partitions merge into one reader). 
pub fn execute_stream(env: &mut JNIEnv, handle: jlong, ffi_stream_addr: jlong) { try_unwrap_or_throw(env, (), |_env| -> JniResult<()> { diff --git a/spark/src/main/java/io/datafusion/spark/ScanBackend.java b/spark/src/main/java/io/datafusion/spark/ScanBackend.java index 63ef0b5..a994c98 100644 --- a/spark/src/main/java/io/datafusion/spark/ScanBackend.java +++ b/spark/src/main/java/io/datafusion/spark/ScanBackend.java @@ -70,7 +70,7 @@ long createScan( /** * Stream the WHOLE plan (all partitions coalesced) into the caller-allocated {@code - * FFI_ArrowArrayStream} at {@code ffiStreamAddr}. Used by legacy per-partition payload mode. + * FFI_ArrowArrayStream} at {@code ffiStreamAddr}. Used by per-partition mode. */ void executeStream(long scanHandle, long ffiStreamAddr); diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala index 90e829a..684e9fd 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionBatch.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionRead /** * Spark `Batch` for a DataFusion-backed scan. Driver-side partition planning: - * - [[LegacyMode]]: one task per `PartitionInfo` (resolved by [[DatafusionScanBuilder]]); when + * - [[PerPartitionMode]]: one task per `PartitionInfo` (resolved by [[DatafusionScanBuilder]]); when * the bridge reported a partitioning and every entry carries key values, tasks implement * `HasPartitionKey` so Spark can actually use the `KeyGroupedPartitioning`. * - [[SharedScanMode]]: one task per DataFusion plan partition index. 
@@ -38,7 +38,7 @@ class DatafusionBatch(val scan: DatafusionScan) extends Batch { val filterBytes: Array[Array[Byte]] = scan.pushedPredicateBytes scan.mode match { - case LegacyMode(partitions, reported) => + case PerPartitionMode(partitions, reported) => val keyed = DatafusionBatch.validateKeyedState(scan.factoryFqcn, partitions, reported) partitions.iterator.map { p => val base = DatafusionInputPartition( diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala index e8b9d69..96b7548 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionColumnarPartitionReader.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch /** - * Per-task columnar reader for the per-partition payload (legacy) path. Lifecycle: + * Per-task columnar reader for the per-partition path. Lifecycle: * * 1. Reflectively instantiate the bridge's `BridgeProviderFactory` (no-arg) and take its * [[ScanBackend]]. diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala index 758b819..5255644 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionInputPartition.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition} sealed trait DatafusionPartition extends InputPartition /** - * Per-task payload for the per-partition payload (legacy) read path. + * Per-task payload for the per-partition read path. * * - `factoryFqcn`: fully-qualified class name of the bridge's `BridgeProviderFactory`. 
The * executor reflectively instantiates this and calls @@ -59,7 +59,7 @@ final case class DatafusionInputPartition( } /** - * Legacy-path payload that additionally carries this partition's key values, precomputed + * Per-partition payload that additionally carries this partition's key values, precomputed * driver-side into an [[InternalRow]]. Emitted by [[DatafusionBatch]] when the bridge reported a * partitioning AND every `PartitionInfo` carries `partitionKeyValues` — implementing * [[HasPartitionKey]] is what makes the reported `KeyGroupedPartitioning` visible to Spark 3.3+ diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala index 1bcda2d..38f0a8b 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScan.scala @@ -39,7 +39,7 @@ sealed trait DatafusionScanMode extends Serializable * from that entry's `partitionBytes`. `reported` is the bridge's optional partitioning * declaration (may be null). */ -final case class LegacyMode( +final case class PerPartitionMode( partitions: Array[PartitionInfo], reported: ReportedPartitioning ) extends DatafusionScanMode @@ -62,7 +62,7 @@ final case class SharedScanMode( * executor applies natively via `ScanBackend.createScan`, and the driver-resolved * [[DatafusionScanMode]]. * - * Legacy mode with a bridge-declared [[ReportedPartitioning]] surfaces `KeyGroupedPartitioning` + * Per-partition mode with a bridge-declared [[ReportedPartitioning]] surfaces `KeyGroupedPartitioning` * via `SupportsReportPartitioning`; note Spark 3.3+ only consumes it when the input partitions * also implement `HasPartitionKey` (see [[DatafusionBatch]]). Shared-scan mode always reports * `UnknownPartitioning` — DataFusion-native partitions carry no key contract. 
@@ -82,7 +82,7 @@ class DatafusionScan( override def description(): String = { val modeDesc = mode match { - case LegacyMode(partitions, reported) => + case PerPartitionMode(partitions, reported) => s"mode=per-partition, partitions=${partitions.length}," + s" reportedPartitioning=${if (reported == null) "unknown" else "key-grouped"}" case SharedScanMode(scanId, n, _, _) => @@ -95,7 +95,7 @@ class DatafusionScan( override def toBatch: Batch = new DatafusionBatch(this) override def outputPartitioning(): Partitioning = mode match { - case LegacyMode(partitions, reported) => + case PerPartitionMode(partitions, reported) => if (reported == null) new UnknownPartitioning(partitions.length) else new KeyGroupedPartitioning(reported.keys().toArray, partitions.length) case SharedScanMode(_, numPartitions, _, _) => diff --git a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala index 316fbb5..a74029c 100644 --- a/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala +++ b/spark/src/main/scala/io/datafusion/spark/DatafusionScanBuilder.scala @@ -87,7 +87,7 @@ class DatafusionScanBuilder( val factory = instantiateFactory(factoryFqcn) val mode: DatafusionScanMode = if (factory.sharedScan(optionsBytes)) buildSharedScanMode() - else buildLegacyMode(factory) + else buildPerPartitionMode(factory) new DatafusionScan( factoryFqcn, optionsBytes, @@ -99,7 +99,7 @@ class DatafusionScanBuilder( ) } - private def buildLegacyMode(factory: BridgeProviderFactory): LegacyMode = { + private def buildPerPartitionMode(factory: BridgeProviderFactory): PerPartitionMode = { val partitions: Array[PartitionInfo] = factory.listPartitions(optionsBytes, pushedBytes) if (partitions == null || partitions.isEmpty) { @@ -107,7 +107,7 @@ class DatafusionScanBuilder( s"BridgeProviderFactory '$factoryFqcn' returned no partitions to scan" ) } - LegacyMode(partitions, factory.reportPartitioning(optionsBytes)) + 
PerPartitionMode(partitions, factory.reportPartitioning(optionsBytes)) } /** From 827068c8c1c7038b44414e63c93417b73f488ff7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 09:25:41 +0200 Subject: [PATCH 21/22] refactor(spark): move bridge scaffold from dev/ to spark/scaffold/ The bridge generator and template are user-facing tooling, not maintainer tooling like the rest of dev/ (release, changelog). Co-locate with the spark/bridge SDK they wire to. Update the script's repo-root resolution (parents[1] -> parents[2] for the extra directory level) and all path references in pom.xml, spark/README.md, and the template README. Co-Authored-By: Claude Opus 4.8 (1M context) --- pom.xml | 4 ++-- spark/README.md | 2 +- {dev => spark/scaffold}/bridge-template/.gitignore | 0 {dev => spark/scaffold}/bridge-template/README.md | 2 +- {dev => spark/scaffold}/bridge-template/native/Cargo.toml | 0 {dev => spark/scaffold}/bridge-template/native/src/lib.rs | 0 {dev => spark/scaffold}/bridge-template/pom.xml | 0 {dev => spark/scaffold}/bridge-template/smoke_test.py | 0 .../src/main/java/__PKG_PATH__/BridgeNative.java | 0 .../src/main/java/__PKG_PATH__/__PREFIX__DataSource.java | 0 .../main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java | 0 .../src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java | 0 .../org.apache.spark.sql.sources.DataSourceRegister | 0 {dev => spark/scaffold}/new_bridge.py | 6 +++--- 14 files changed, 7 insertions(+), 7 deletions(-) rename {dev => spark/scaffold}/bridge-template/.gitignore (100%) rename {dev => spark/scaffold}/bridge-template/README.md (96%) rename {dev => spark/scaffold}/bridge-template/native/Cargo.toml (100%) rename {dev => spark/scaffold}/bridge-template/native/src/lib.rs (100%) rename {dev => spark/scaffold}/bridge-template/pom.xml (100%) rename {dev => spark/scaffold}/bridge-template/smoke_test.py (100%) rename {dev => spark/scaffold}/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java (100%) rename {dev => 
spark/scaffold}/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java (100%) rename {dev => spark/scaffold}/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java (100%) rename {dev => spark/scaffold}/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java (100%) rename {dev => spark/scaffold}/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister (100%) rename {dev => spark/scaffold}/new_bridge.py (96%) diff --git a/pom.xml b/pom.xml index 8bf1558..6baeb94 100644 --- a/pom.xml +++ b/pom.xml @@ -189,8 +189,8 @@ under the License. dev/release/rat_exclude_files.txt - dev/bridge-template/** + spark/scaffold/new_bridge.py, which must not impose ASF headers on them --> + spark/scaffold/bridge-template/** diff --git a/spark/README.md b/spark/README.md index 941c427..5cc3d3c 100644 --- a/spark/README.md +++ b/spark/README.md @@ -45,7 +45,7 @@ concrete provider is linked into the same cdylib as the scan machinery. 
Don't hand-assemble the pieces below — stamp them out: ```bash -python3 dev/new_bridge.py --name acme --package com.example.acme +python3 spark/scaffold/new_bridge.py --name acme --package com.example.acme ``` generates a standalone project (Rust cdylib with a working demo provider, diff --git a/dev/bridge-template/.gitignore b/spark/scaffold/bridge-template/.gitignore similarity index 100% rename from dev/bridge-template/.gitignore rename to spark/scaffold/bridge-template/.gitignore diff --git a/dev/bridge-template/README.md b/spark/scaffold/bridge-template/README.md similarity index 96% rename from dev/bridge-template/README.md rename to spark/scaffold/bridge-template/README.md index ff3e37f..8259e53 100644 --- a/dev/bridge-template/README.md +++ b/spark/scaffold/bridge-template/README.md @@ -2,7 +2,7 @@ A Spark DataSource V2 connector for the `__FORMAT__` format, built on the [datafusion-java Spark connector](https://github.com/apache/datafusion-java) -and its `datafusion-spark-bridge` Rust SDK. Generated by `dev/new_bridge.py`; +and its `datafusion-spark-bridge` Rust SDK. Generated by `spark/scaffold/new_bridge.py`; the only code you need to touch is marked `TODO`. 
## What's here diff --git a/dev/bridge-template/native/Cargo.toml b/spark/scaffold/bridge-template/native/Cargo.toml similarity index 100% rename from dev/bridge-template/native/Cargo.toml rename to spark/scaffold/bridge-template/native/Cargo.toml diff --git a/dev/bridge-template/native/src/lib.rs b/spark/scaffold/bridge-template/native/src/lib.rs similarity index 100% rename from dev/bridge-template/native/src/lib.rs rename to spark/scaffold/bridge-template/native/src/lib.rs diff --git a/dev/bridge-template/pom.xml b/spark/scaffold/bridge-template/pom.xml similarity index 100% rename from dev/bridge-template/pom.xml rename to spark/scaffold/bridge-template/pom.xml diff --git a/dev/bridge-template/smoke_test.py b/spark/scaffold/bridge-template/smoke_test.py similarity index 100% rename from dev/bridge-template/smoke_test.py rename to spark/scaffold/bridge-template/smoke_test.py diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java b/spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java similarity index 100% rename from dev/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java rename to spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/BridgeNative.java diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java b/spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java similarity index 100% rename from dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java rename to spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__DataSource.java diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java b/spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java similarity index 100% rename from dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java rename to spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ProviderFactory.java 
diff --git a/dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java b/spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java similarity index 100% rename from dev/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java rename to spark/scaffold/bridge-template/src/main/java/__PKG_PATH__/__PREFIX__ScanBackend.java diff --git a/dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/scaffold/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister similarity index 100% rename from dev/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister rename to spark/scaffold/bridge-template/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister diff --git a/dev/new_bridge.py b/spark/scaffold/new_bridge.py similarity index 96% rename from dev/new_bridge.py rename to spark/scaffold/new_bridge.py index 6e0b87a..03b8de7 100644 --- a/dev/new_bridge.py +++ b/spark/scaffold/new_bridge.py @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -"""Scaffold a new Spark bridge project from dev/bridge-template/. +"""Scaffold a new Spark bridge project from spark/scaffold/bridge-template/. Stamps out a standalone project (Maven + Cargo) wired to the datafusion-spark-bridge SDK: a Rust cdylib with `export_bridge!` and a demo @@ -26,7 +26,7 @@ build/run commands. 
Usage: - python3 dev/new_bridge.py --name acme --package com.example.acme \ + python3 spark/scaffold/new_bridge.py --name acme --package com.example.acme \ [--output DIR] [--datafusion-java REPO_ROOT] `--name` is the Spark format short name (spark.read.format("acme")); it also @@ -79,7 +79,7 @@ def main() -> None: prefix = class_prefix(args.name) crate = args.name.replace("_", "-") + "-spark-bridge" lib = args.name + "_spark_bridge" - repo = Path(args.datafusion_java).resolve() if args.datafusion_java else TEMPLATE_DIR.parents[1] + repo = Path(args.datafusion_java).resolve() if args.datafusion_java else TEMPLATE_DIR.parents[2] sdk_path = repo / "spark" / "bridge" if not (sdk_path / "Cargo.toml").is_file(): sys.exit(f"datafusion-spark-bridge crate not found at {sdk_path}") From 0cc474a3bddaf059e4f18319a74c17ed6e649a13 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 12 Jun 2026 10:27:28 +0200 Subject: [PATCH 22/22] add support for fixed sized list widening --- spark/bridge/src/widening.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/spark/bridge/src/widening.rs b/spark/bridge/src/widening.rs index 8879507..86c4abf 100644 --- a/spark/bridge/src/widening.rs +++ b/spark/bridge/src/widening.rs @@ -67,8 +67,16 @@ pub fn arrow_cast_widening(dt: &DataType) -> Option { .map(|inner| DataType::List(widened_child(field, inner))), DataType::LargeList(field) => arrow_cast_widening(field.data_type()) .map(|inner| DataType::LargeList(widened_child(field, inner))), - DataType::FixedSizeList(field, size) => arrow_cast_widening(field.data_type()) - .map(|inner| DataType::FixedSizeList(widened_child(field, inner), *size)), + // Spark 3.5's ArrowColumnVector cannot read FixedSizeList at all, so + // always convert it to a (variable) List — which Spark maps to + // ArrayType — widening the child element type when needed too. 
+ DataType::FixedSizeList(field, _size) => { + let child = match arrow_cast_widening(field.data_type()) { + Some(inner) => widened_child(field, inner), + None => Arc::clone(field), + }; + Some(DataType::List(child)) + } _ => None, } }