diff --git a/.github/workflows/verify-package.yml b/.github/workflows/verify-package.yml index 91744299..085ac61d 100644 --- a/.github/workflows/verify-package.yml +++ b/.github/workflows/verify-package.yml @@ -49,6 +49,7 @@ jobs: sudo apt-get install -y --no-install-recommends \ cmake \ libssl-dev \ + libcurl4-openssl-dev \ pkg-config \ libsasl2-dev \ protobuf-compiler diff --git a/Cargo.lock b/Cargo.lock index 26f07400..e174c43f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "gimli", ] +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "ahash" version = "0.8.12" @@ -34,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -57,9 +78,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -72,15 +93,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = 
"940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -107,9 +128,18 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ar_archive_writer" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] [[package]] name = "arbitrary" @@ -117,6 +147,53 @@ version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +dependencies = [ + "arrow-arith", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-csv", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema 
55.2.0", + "arrow-select 55.2.0", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "num", +] + [[package]] name = "arrow-array" version = "52.2.0" @@ -124,15 +201,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", "chrono", "half", "hashbrown 0.14.5", "num", ] +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.15.5", + "num", +] + [[package]] name = "arrow-buffer" version = "52.2.0" @@ -144,34 +238,93 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes", + "half", + "num", +] + [[package]] name = "arrow-cast" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "arrow-select 52.2.0", + "atoi", + "base64", + "chrono", + 
"half", + "lexical-core 0.8.5", + "num", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", "atoi", "base64", "chrono", + "comfy-table", "half", - "lexical-core", + "lexical-core 1.0.6", "num", "ryu", ] +[[package]] +name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array 55.2.0", + "arrow-cast 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + [[package]] name = "arrow-data" version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 52.2.0", + "arrow-schema 52.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer 55.2.0", + "arrow-schema 55.2.0", "half", "num", ] @@ -182,12 +335,74 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "flatbuffers", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-cast 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "flatbuffers 24.12.23", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "flatbuffers 25.12.19", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "half", + "indexmap 2.13.1", + "lexical-core 1.0.6", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "half", ] [[package]] @@ -196,6 +411,16 @@ version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "arrow-select" version = "52.2.0" @@ -203,11 +428,59 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-buffer 52.2.0", + "arrow-data 52.2.0", + "arrow-schema 52.2.0", + "num", +] + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "num", +] + +[[package]] +name = "arrow-string" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "memchr", "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "async-compression" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", ] [[package]] @@ -286,7 +559,7 @@ dependencies = [ "rustversion", "serde", "sync_wrapper", - "tower 0.5.2", + "tower 0.5.3", "tower-layer", "tower-service", ] @@ -317,13 +590,37 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + 
"libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bincode" -version = "1.3.3" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" dependencies = [ + "bincode_derive", "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", ] [[package]] @@ -353,14 +650,14 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "cexpr", "clang-sys", "itertools 0.13.0", "proc-macro2", "quote", "regex", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "shlex", "syn", ] @@ -373,9 +670,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "bitmaps" @@ -386,6 +683,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", 
+ "constant_time_eq", + "cpufeatures 0.3.0", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -395,20 +715,56 @@ dependencies = [ "generic-array", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" dependencies = [ "allocator-api2", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bzip2" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] [[package]] name = "bzip2-sys" @@ -440,7 +796,7 @@ checksum = "20a158160765c6a7d0d8c072a53d772e4cb243f38b04bfcf6b4939cfbe7482e7" dependencies = [ "cap-primitives", "cap-std", - "rustix 1.1.3", + "rustix 1.1.4", "smallvec", ] @@ -456,7 
+812,7 @@ dependencies = [ "io-lifetimes", "ipnet", "maybe-owned", - "rustix 1.1.3", + "rustix 1.1.4", "rustix-linux-procfs", "windows-sys 0.59.0", "winx", @@ -469,7 +825,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8144c22e24bbcf26ade86cb6501a0916c46b7e4787abdb0045a467eb1645a1d" dependencies = [ "ambient-authority", - "rand", + "rand 0.8.5", ] [[package]] @@ -481,7 +837,7 @@ dependencies = [ "cap-primitives", "io-extras", "io-lifetimes", - "rustix 1.1.3", + "rustix 1.1.4", ] [[package]] @@ -494,15 +850,15 @@ dependencies = [ "cap-primitives", "iana-time-zone", "once_cell", - "rustix 1.1.3", + "rustix 1.1.4", "winx", ] [[package]] name = "cc" -version = "1.2.51" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0aeaff4ff1a90589618835a598e545176939b97874f7abc7851caa0618f203" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -533,15 +889,27 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", + "js-sys", "num-traits", + "wasm-bindgen", "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -555,9 +923,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.54" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -565,9 +933,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.54" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -577,9 +945,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", @@ -589,9 +957,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.6" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "clipboard-win" @@ -604,9 +972,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.57" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" dependencies = [ "cc", ] @@ -617,14 +985,14 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" dependencies = [ - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "comfy-table" @@ -652,11 +1020,17 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", "once_cell", "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -681,38 +1055,47 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "cranelift-assembler-x64" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377b13bf002a0774fcccac4f1102a10f04893d24060cf4b7350c87e4cbb647c" +checksum = "50a04121a197fde2fe896f8e7cac9812fc41ed6ee9c63e1906090f9f497845f6" dependencies = [ "cranelift-assembler-x64-meta", ] [[package]] name = "cranelift-assembler-x64-meta" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfa027979140d023b25bf7509fb7ede3a54c3d3871fb5ead4673c4b633f671a2" +checksum = "a09e699a94f477303820fb2167024f091543d6240783a2d3b01a3f21c42bc744" dependencies = [ "cranelift-srcgen", ] [[package]] name = "cranelift-bforest" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"618e4da87d9179a70b3c2f664451ca8898987aa6eb9f487d16988588b5d8cc40" +checksum = "f07732c662a9755529e332d86f8c5842171f6e98ba4d5976a178043dad838654" dependencies = [ "cranelift-entity", ] [[package]] name = "cranelift-bitset" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db53764b5dad233b37b8f5dc54d3caa9900c54579195e00f17ea21f03f71aaa7" +checksum = "18391da761cf362a06def7a7cf11474d79e55801dd34c2e9ba105b33dc0aef88" dependencies = [ "serde", "serde_derive", @@ -720,9 +1103,9 @@ dependencies = [ [[package]] name = "cranelift-codegen" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae927f1d8c0abddaa863acd201471d56e7fc6c3925104f4861ed4dc3e28b421" +checksum = "0b3a09b3042c69810d255aef59ddc3b3e4c0644d1d90ecfd6e3837798cc88a3c" dependencies = [ "bumpalo", "cranelift-assembler-x64", @@ -739,7 +1122,7 @@ dependencies = [ "postcard", "pulley-interpreter", "regalloc2", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "serde", "serde_derive", "sha2", @@ -750,9 +1133,9 @@ dependencies = [ [[package]] name = "cranelift-codegen-meta" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3fcf1e3e6757834bd2584f4cbff023fcc198e9279dcb5d684b4bb27a9b19f54" +checksum = "75817926ec812241889208d1b190cadb7fedded4592a4bb01b8524babb9e4849" dependencies = [ "cranelift-assembler-x64-meta", "cranelift-codegen-shared", @@ -763,24 +1146,24 @@ dependencies = [ [[package]] name = "cranelift-codegen-shared" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "205dcb9e6ccf9d368b7466be675ff6ee54a63e36da6fe20e72d45169cf6fd254" +checksum = "859158f87a59476476eda3884d883c32e08a143cf3d315095533b362a3250a63" [[package]] name = "cranelift-control" -version = "0.128.3" +version = "0.128.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "108eca9fcfe86026054f931eceaf57b722c1b97464bf8265323a9b5877238817" +checksum = "03b65a9aec442d715cbf54d14548b8f395476c09cef7abe03e104a378291ab88" dependencies = [ "arbitrary", ] [[package]] name = "cranelift-entity" -version = "0.128.3" +version = "0.128.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0d96496910065d3165f84ff8e1e393916f4c086f88ac8e1b407678bc78735aa" +checksum = "8334c99a7e86060c24028732efd23bac84585770dcb752329c69f135d64f2fc1" dependencies = [ "cranelift-bitset", "serde", @@ -788,120 +1171,765 @@ dependencies = [ ] [[package]] -name = "cranelift-frontend" -version = "0.128.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e303983ad7e23c850f24d9c41fc3cb346e1b930f066d3966545e4c98dac5c9fb" +name = "cranelift-frontend" +version = "0.128.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ac6c095aa5b3e845d7ca3461e67e2b65249eb5401477a5ff9100369b745111" +dependencies = [ + "cranelift-codegen", + "log", + "smallvec", + "target-lexicon", +] + +[[package]] +name = "cranelift-isle" +version = "0.128.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d3d992870ed4f0f2e82e2175275cb3a123a46e9660c6558c46417b822c91fa" + +[[package]] +name = "cranelift-native" +version = "0.128.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee32e36beaf80f309edb535274cfe0349e1c5cf5799ba2d9f42e828285c6b52e" +dependencies = [ + "cranelift-codegen", + "libc", + "target-lexicon", +] + +[[package]] +name = "cranelift-srcgen" +version = "0.128.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "903adeaf4938e60209a97b53a2e4326cd2d356aab9764a1934630204bae381c9" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crossterm" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +dependencies = [ + "bitflags 2.11.0", + "crossterm_winapi", + "document-features", + "parking_lot", + "rustix 1.1.4", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "curl-sys" +version = "0.4.87+curl-8.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a460380f0ef783703dcbe909107f39c162adeac050d73c850055118b5b6327" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", + "windows-sys 0.59.0", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + 
"datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc 55.2.0", + "base64", + "half", + "hashbrown 0.14.5", + "indexmap 2.13.1", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + 
"web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet", + "rand 0.9.2", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + 
"datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet", + "rand 0.9.2", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" + +[[package]] +name = "datafusion-execution" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.13.1", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "48.0.1" +source = 
"git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.13.1", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "arrow-buffer 55.2.0", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + 
"datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "cranelift-codegen", - "log", - "smallvec", - "target-lexicon", + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", ] [[package]] -name = "cranelift-isle" -version = "0.128.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b0cf8d867d891245836cac7abafb0a5b0ea040a019d720702b3b8bcba40bfa" +name = "datafusion-functions-window" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] [[package]] -name = "cranelift-native" -version = "0.128.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24b641e315443e27807b69c440fe766737d7e718c68beb665a2d69259c77bf3" +name = "datafusion-functions-window-common" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "cranelift-codegen", - "libc", - "target-lexicon", + "datafusion-common", + "datafusion-physical-expr-common", ] [[package]] -name = "cranelift-srcgen" -version = "0.128.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4e378a54e7168a689486d67ee1f818b7e5356e54ae51a1d7a53f4f13f7f8b7a" +name = "datafusion-macros" +version = "48.0.1" +source = 
"git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "datafusion-expr", + "quote", + "syn", +] [[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +name = "datafusion-optimizer" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "cfg-if", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "indexmap 2.13.1", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", ] [[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +name = "datafusion-physical-expr" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "crossbeam-utils", + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap 2.13.1", + "itertools 0.14.0", + "log", + "paste", + "petgraph 0.8.3", ] [[package]] -name = "crossbeam-deque" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +name = "datafusion-physical-expr-common" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + 
"hashbrown 0.14.5", + "itertools 0.14.0", ] [[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +name = "datafusion-physical-optimizer" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "crossbeam-utils", + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", + "recursive", ] [[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +name = "datafusion-physical-plan" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema 55.2.0", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap 2.13.1", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] [[package]] -name = "crossterm" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" +name = "datafusion-proto" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "bitflags 2.10.0", - "crossterm_winapi", - 
"document-features", - "parking_lot", - "rustix 1.1.3", - "winapi", + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", ] [[package]] -name = "crossterm_winapi" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +name = "datafusion-proto-common" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "winapi", + "arrow", + "datafusion-common", + "prost", ] [[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +name = "datafusion-session" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] [[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +name = "datafusion-sql" +version = "48.0.1" +source = "git+https://github.com/FunctionStream/datafusion?branch=48.0.1%2Ffs#c3b2ac13dabd4145f21045e315529b917079b7d1" dependencies = [ - "generic-array", - "typenum", + "arrow", + "bigdecimal", + "datafusion-common", + "datafusion-expr", + "indexmap 2.13.1", + "log", + "recursive", + "regex", + "sqlparser", ] [[package]] @@ -915,9 +1943,9 @@ dependencies = [ [[package]] name = 
"deranged" -version = "0.5.5" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", ] @@ -930,6 +1958,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -1061,9 +2090,9 @@ checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" [[package]] name = "fastrand" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f" [[package]] name = "fd-lock" @@ -1072,15 +2101,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix 1.1.3", + "rustix 1.1.4", "windows-sys 0.59.0", ] [[package]] name = "find-msvc-tools" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645cbb3a84e60b7531617d5ae4e57f7e27308f6445f5abf653209ea76dec8dff" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" @@ -1104,6 +2133,27 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags 2.11.0", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1132,7 +2182,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94e7099f6313ecacbe1256e8ff9d617b75d1bcb16a6fddef94866d225a01a14a" dependencies = [ "io-lifetimes", - "rustix 1.1.3", + "rustix 1.1.4", "windows-sys 0.59.0", ] @@ -1140,65 +2190,82 @@ dependencies = [ name = "function-stream" version = "0.6.0" dependencies = [ + "ahash", "anyhow", - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow", + "arrow-array 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json", + "arrow-schema 55.2.0", "async-trait", "base64", "bincode", - "clap", + "chrono", "crossbeam-channel", + "datafusion", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-proto", + "futures", + "governor", + "itertools 0.14.0", "log", "lru", "num_cpus", "parking_lot", - "pest", - "pest_derive", + "petgraph 0.7.1", "proctitle", + "prost", "protocol", + "rand 0.8.5", "rdkafka", "rocksdb", "serde", "serde_json", + "serde_json_path", "serde_yaml", - "thiserror 2.0.17", + "sqlparser", + "strum", + "thiserror 2.0.18", "tokio", "tokio-stream", "tonic", "tracing", "tracing-appender", "tracing-subscriber", + "unicase", "uuid", "wasmtime", "wasmtime-wasi", + "xxhash-rust", ] [[package]] name = "function-stream-cli" version = "0.1.0" dependencies = [ - "arrow-array", - "arrow-ipc", - "arrow-schema", + "arrow-array 52.2.0", + "arrow-ipc 52.2.0", + "arrow-schema 52.2.0", "clap", "comfy-table", - "function-stream", "protocol", "rustyline", - "thiserror 2.0.17", "tokio", "tonic", ] [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = 
"8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1207,9 +2274,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -1217,42 +2284,70 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = 
"c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-timer" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] @@ -1262,9 +2357,9 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25234f20a3ec0a962a61770cfe39ecf03cb529a6e474ad8cff025ed497eda557" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "debugid", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "serde", "serde_derive", "serde_json", @@ -1282,9 +2377,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", "libc", @@ -1296,11 +2391,26 @@ name = "getrandom" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + 
"r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", + "wasip3", ] [[package]] @@ -1310,7 +2420,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" dependencies = [ "fallible-iterator", - "indexmap 2.12.1", + "indexmap 2.13.1", "stable_deref_trait", ] @@ -1320,11 +2430,34 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "governor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be93b4ec2e4710b04d9264c0c7350cdd62a8c20e5e4ac732552ebb8f0debe8eb" +dependencies = [ + "cfg-if", + "dashmap", + "futures-sink", + "futures-timer", + "futures-util", + "getrandom 0.3.4", + "no-std-compat", + "nonzero_ext", + "parking_lot", + "portable-atomic", + "quanta", + "rand 0.9.2", + "smallvec", + "spinning_top", + "web-time", +] + [[package]] name = "h2" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ "atomic-waker", "bytes", @@ -1332,7 +2465,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.12.1", + "indexmap 2.13.1", "slab", "tokio", "tokio-util", @@ -1362,6 +2495,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + 
"allocator-api2", +] [[package]] name = "hashbrown" @@ -1393,6 +2530,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "home" version = "0.5.12" @@ -1455,9 +2598,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", @@ -1470,7 +2613,6 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -1491,20 +2633,19 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "bytes", "futures-channel", - "futures-core", "futures-util", "http", "http-body", "hyper", "libc", "pin-project-lite", - "socket2 0.6.1", + "socket2 0.6.3", "tokio", "tower-service", "tracing", @@ -1512,9 +2653,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.64" +version = "0.1.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1536,12 
+2677,13 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -1549,9 +2691,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -1562,9 +2704,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -1576,15 +2718,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ "icu_collections", "icu_locale_core", @@ -1596,15 +2738,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.2" +version 
= "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", @@ -1617,9 +2759,9 @@ dependencies = [ [[package]] name = "id-arena" -version = "2.2.1" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25a2bc672d1148e28034f176e01fffebb08b35768468cc954630da77a1449005" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" [[package]] name = "idna" @@ -1649,7 +2791,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe" dependencies = [ "bitmaps", - "rand_core", + "rand_core 0.6.4", "rand_xoshiro", "sized-chunks", "typenum", @@ -1668,9 +2810,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.1" +version = "2.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -1678,6 +2820,21 @@ dependencies = [ "serde_core", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + [[package]] name = "io-extras" version = "0.18.4" @@ -1696,9 +2853,9 @@ checksum = "06432fb54d3be7964ecd3649233cddf80db2832f47fec34c01f65b3d9d774983" [[package]] name = "ipnet" -version = "2.11.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "is-terminal" @@ -1737,9 +2894,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "ittapi" @@ -1773,10 +2930,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -1811,11 +2970,24 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", + "lexical-parse-float 0.8.5", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "lexical-write-float 0.8.5", + "lexical-write-integer 0.8.5", +] + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float 1.0.6", + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", + "lexical-write-float 1.0.6", + "lexical-write-integer 1.0.6", ] [[package]] @@ -1824,21 +2996,40 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" dependencies = [ - "lexical-parse-integer", - "lexical-util", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer 1.0.6", + "lexical-util 1.0.7", +] + [[package]] name = "lexical-parse-integer" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" dependencies = [ - "lexical-util", + "lexical-util 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util 1.0.7", +] + [[package]] name = "lexical-util" version = "0.8.5" @@ -1848,32 +3039,57 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + [[package]] name = "lexical-write-float" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" dependencies = [ - "lexical-util", - "lexical-write-integer", + 
"lexical-util 0.8.5", + "lexical-write-integer 0.8.5", "static_assertions", ] +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util 1.0.7", + "lexical-write-integer 1.0.6", +] + [[package]] name = "lexical-write-integer" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" dependencies = [ - "lexical-util", - "static_assertions", + "lexical-util 0.8.5", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util 1.0.7", ] [[package]] name = "libc" -version = "0.2.179" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a2d376baa530d1238d133232d15e239abad80d05838b4b59354e5268af431f" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libloading" @@ -1887,17 +3103,16 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ - "bitflags 2.10.0", "libc", ] @@ -1919,9 +3134,9 @@ dependencies = [ [[package]] name = "libz-sys" 
-version = "1.1.23" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d118bbf3771060e7311cc7bb0545b01d08a8b4a7de949198dec1fa0ca1c0f7" +checksum = "fc3a226e576f50782b3305c5ccf458698f92798987f551c6a02efe8276721e22" dependencies = [ "cc", "libc", @@ -1937,15 +3152,15 @@ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "litrs" @@ -1987,6 +3202,26 @@ dependencies = [ "libc", ] +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "mach2" version = "0.4.3" @@ -2017,11 +3252,21 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4facc753ae494aeb6e3c22f839b158aebd4f9270f55cd3c79906c45476c47ab4" +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + 
"cfg-if", + "digest", +] + [[package]] name = "memchr" -version = "2.7.6" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "memfd" @@ -2029,7 +3274,7 @@ version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad38eb12aea514a0466ea40a80fd8cc83637065948eb4a426e4aa46261175227" dependencies = [ - "rustix 1.1.3", + "rustix 1.1.4", ] [[package]] @@ -2044,11 +3289,21 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", @@ -2076,12 +3331,18 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "cfg-if", "cfg_aliases", "libc", ] +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -2092,6 +3353,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2136,9 +3403,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -2193,9 +3460,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -2203,9 +3470,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -2221,15 +3488,39 @@ checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" dependencies = [ "crc32fast", "hashbrown 0.15.5", - "indexmap 2.12.1", + "indexmap 2.13.1", "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http", + "humantime", + "itertools 0.14.0", + "parking_lot", + "percent-encoding", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + 
[[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -2239,9 +3530,9 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "openssl-sys" -version = "0.9.111" +version = "0.9.112" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" dependencies = [ "cc", "libc", @@ -2249,6 +3540,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "os_pipe" version = "1.2.3" @@ -2282,6 +3582,48 @@ dependencies = [ "windows-link", ] +[[package]] +name = "parquet" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +dependencies = [ + "ahash", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -2295,82 +3637,69 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] -name = "pest" -version = "2.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" -dependencies = [ - "memchr", - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.8.5" +name = "petgraph" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "pest", - "pest_generator", + "fixedbitset 0.4.2", + "indexmap 2.13.1", ] [[package]] -name = "pest_generator" -version = "2.8.5" +name = "petgraph" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn", + "fixedbitset 0.5.7", + "indexmap 2.13.1", ] [[package]] -name = "pest_meta" -version = "2.8.5" +name = "petgraph" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "pest", - "sha2", + "fixedbitset 0.5.7", + "hashbrown 0.15.5", + "indexmap 2.13.1", + "serde", ] [[package]] -name = "petgraph" -version = "0.6.5" +name = "phf" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "fixedbitset 0.4.2", - "indexmap 2.12.1", + "phf_shared", ] [[package]] -name = "petgraph" -version = "0.7.1" +name = "phf_shared" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" dependencies = [ - "fixedbitset 0.5.7", - "indexmap 2.12.1", + "siphasher", ] [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", @@ -2379,15 +3708,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" - -[[package]] -name = "pin-utils" -version = "0.1.0" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pkg-config" @@ -2395,6 +3718,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + [[package]] name = "postcard" version = "1.1.3" @@ -2409,9 +3738,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -2443,18 +3772,18 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] [[package]] name = "proc-macro2" -version = "1.0.104" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9695f8df41bb4f3d222c95a67532365f569318332d03d5f3f67f37b20e6ebdf0" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] @@ -2529,15 +3858,26 @@ dependencies = [ "env_logger", "log", "prost", + "serde", "tonic", "tonic-build", ] +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "pulley-interpreter" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01051a5b172e07f9197b85060e6583b942aec679dac08416647bf7e7dc916b65" +checksum = 
"e9812652c1feb63cf39f8780cecac154a32b22b3665806c733cd4072547233a4" dependencies = [ "cranelift-bitset", "log", @@ -2547,20 +3887,35 @@ dependencies = [ [[package]] name = "pulley-macros" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf194f5b1a415ef3a44ee35056f4009092cc4038a9f7e3c7c1e392f48ee7dbb" +checksum = "56000349b6896e3d44286eb9c330891237f40b27fd43c1ccc84547d0b463cb40" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quote" -version = "1.0.42" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -2571,6 +3926,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radix_trie" version = "0.2.1" @@ -2588,8 +3949,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -2599,7 +3970,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -2608,7 +3989,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", ] [[package]] @@ -2617,7 +4007,16 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa" dependencies = [ - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.11.0", ] [[package]] @@ -2660,11 +4059,12 @@ dependencies = [ [[package]] name = "rdkafka-sys" -version = "4.9.0+2.10.0" +version = "4.10.0+2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5230dca48bc354d718269f3e4353280e188b610f7af7e2fcf54b7a79d5802872" +checksum = "e234cf318915c1059d4921ef7f75616b5219b10b46e9f3a511a15eb4b56a3f77" 
dependencies = [ "cmake", + "curl-sys", "libc", "libz-sys", "num_enum", @@ -2673,13 +4073,33 @@ dependencies = [ "sasl2-sys", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "redox_syscall" version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", ] [[package]] @@ -2688,7 +4108,7 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", "libredox", "thiserror 1.0.69", ] @@ -2703,16 +4123,16 @@ dependencies = [ "bumpalo", "hashbrown 0.15.5", "log", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "serde", "smallvec", ] [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -2722,9 +4142,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = 
"6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -2733,9 +4153,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rocksdb" @@ -2749,9 +4169,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" [[package]] name = "rustc-hash" @@ -2761,9 +4181,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustc_version" @@ -2780,7 +4200,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "errno", "libc", "linux-raw-sys 0.4.15", @@ -2789,14 +4209,14 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "errno", "libc", - "linux-raw-sys 0.11.0", + 
"linux-raw-sys 0.12.1", "windows-sys 0.61.2", ] @@ -2807,7 +4227,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fc84bf7e9aa16c4f2c758f27412dc9841341e16aa682d9c7ac308fe3ee12056" dependencies = [ "once_cell", - "rustix 1.1.3", + "rustix 1.1.4", ] [[package]] @@ -2822,7 +4242,7 @@ version = "14.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7803e8936da37efd9b6d4478277f4b2b9bb5cdb37a113e8d63222e58da647e63" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "cfg-if", "clipboard-win", "fd-lock", @@ -2840,9 +4260,18 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.22" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] [[package]] name = "sasl2-sys" @@ -2864,14 +4293,20 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" dependencies = [ "serde", "serde_core", ] +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -2904,9 +4339,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.148" +version = "1.0.149" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3084b546a1dd6289475996f182a22aba973866ea8e8b02c51d9f46b1336a22da" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ "itoa", "memchr", @@ -2915,11 +4350,61 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_json_path" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b992cea3194eea663ba99a042d61cea4bd1872da37021af56f6a37e0359b9d33" +dependencies = [ + "inventory", + "nom", + "regex", + "serde", + "serde_json", + "serde_json_path_core", + "serde_json_path_macros", + "thiserror 2.0.18", +] + +[[package]] +name = "serde_json_path_core" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde67d8dfe7d4967b5a95e247d4148368ddd1e753e500adb34b3ffe40c6bc1bc" +dependencies = [ + "inventory", + "serde", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "serde_json_path_macros" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "517acfa7f77ddaf5c43d5f119c44a683774e130b4247b7d3210f8924506cfac8" +dependencies = [ + "inventory", + "serde_json_path_core", + "serde_json_path_macros_internal", +] + +[[package]] +name = "serde_json_path_macros_internal" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafbefbe175fa9bf03ca83ef89beecff7d2a95aaacd5732325b90ac8c3bd7b90" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_spanned" -version = "1.0.4" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" dependencies = [ "serde_core", ] @@ -2930,7 +4415,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.12.1", + "indexmap 2.13.1", "itoa", "ryu", "serde", @@ -2944,7 +4429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -2996,14 +4481,32 @@ dependencies = [ ] [[package]] -name = "signal-hook-registry" -version = "1.4.8" +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" -dependencies = [ - "errno", - "libc", -] +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "sized-chunks" @@ -3017,9 +4520,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -3030,6 +4533,12 @@ dependencies = [ "serde", ] +[[package]] +name = "snap" +version = "1.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.5.10" @@ -3042,12 +4551,41 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", +] + +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.58.0%2Ffs#c7b8376f307bb34d04b58ff7504820241e43da0e" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "git+https://github.com/FunctionStream/sqlparser-rs?branch=0.58.0%2Ffs#c7b8376f307bb34d04b58ff7504820241e43da0e" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -3056,6 +4594,19 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -3068,11 +4619,39 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" -version = "2.0.113" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678faa00651c9eb72dd2020cbdf275d92eccb2400d568e419efdd64838145cb4" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -3102,7 +4681,7 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc4592f674ce18521c2a81483873a49596655b179f71c5e05d10c1fe66c78745" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "cap-fs-ext", "cap-std", "fd-lock", @@ -3114,20 +4693,20 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "tempfile" -version = "3.24.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" +checksum = 
"32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", - "rustix 1.1.3", + "rustix 1.1.4", "windows-sys 0.61.2", ] @@ -3151,11 +4730,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -3171,9 +4750,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -3189,32 +4768,43 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", "num-conv", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = 
"7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -3231,9 +4821,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -3241,26 +4831,25 @@ dependencies = [ [[package]] name = "tokio" -version = "1.49.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd" dependencies = [ "bytes", "libc", "mio", - "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.1", + "socket2 0.6.3", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -3293,17 +4882,17 @@ dependencies = [ [[package]] name = "toml" -version = "0.9.11+spec-1.1.0" +version = "0.9.12+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3afc9a848309fe1aaffaed6e1546a7a14de1f935dc9d89d32afd9a44bab7c46" +checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" dependencies = [ - 
"indexmap 2.12.1", + "indexmap 2.13.1", "serde_core", "serde_spanned", - "toml_datetime", + "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", - "winnow", + "winnow 0.7.15", ] [[package]] @@ -3315,32 +4904,41 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" +dependencies = [ + "serde_core", +] + [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "a82418ca169e235e6c399a84e395ab6debeb3bc90edc959bf0f48647c6a32d1b" dependencies = [ - "indexmap 2.12.1", - "toml_datetime", + "indexmap 2.13.1", + "toml_datetime 1.1.1+spec-1.1.0", "toml_parser", - "winnow", + "winnow 1.0.1", ] [[package]] name = "toml_parser" -version = "1.0.6+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "winnow", + "winnow 1.0.1", ] [[package]] name = "toml_writer" -version = "1.0.6+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" [[package]] name = "tonic" @@ -3397,7 +4995,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -3408,9 +5006,9 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.2" +version = "0.5.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", @@ -3450,7 +5048,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "786d480bce6247ab75f005b14ae1624ad978d3029d9113f0a22fa1ac773faeaf" dependencies = [ "crossbeam-channel", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "tracing-subscriber", ] @@ -3487,36 +5085,22 @@ dependencies = [ "tracing-core", ] -[[package]] -name = "tracing-serde" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" -dependencies = [ - "serde", - "tracing-core", -] - [[package]] name = "tracing-subscriber" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ - "chrono", "matchers", "nu-ansi-term", "once_cell", "regex-automata", - "serde", - "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", - "tracing-serde", ] [[package]] @@ -3525,6 +5109,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "typenum" version = "1.19.0" @@ -3532,22 +5122,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" 
[[package]] -name = "ucd-trie" -version = "0.1.7" +name = "unicase" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" [[package]] name = "unicode-width" @@ -3573,11 +5163,17 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", @@ -3599,11 +5195,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.19.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" +checksum = 
"5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", "js-sys", "wasm-bindgen", ] @@ -3626,6 +5222,22 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -3643,18 +5255,27 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" dependencies = [ "cfg-if", "once_cell", @@ -3663,11 +5284,21 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = 
"0.4.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3675,9 +5306,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" dependencies = [ "bumpalo", "proc-macro2", @@ -3688,9 +5319,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" dependencies = [ "unicode-ident", ] @@ -3704,15 +5335,15 @@ dependencies = [ "anyhow", "heck", "im-rc", - "indexmap 2.12.1", + "indexmap 2.13.1", "log", "petgraph 0.6.5", "serde", "serde_derive", "serde_yaml", "smallvec", - "wasm-encoder", - "wasmparser", + "wasm-encoder 0.243.0", + "wasmparser 0.243.0", "wat", ] @@ -3723,7 +5354,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c55db9c896d70bd9fa535ce83cd4e1f2ec3726b0edd2142079f594fc3be1cb35" dependencies = [ "leb128fmt", - "wasmparser", + "wasmparser 0.243.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser 0.244.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.246.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61fb705ce81adde29d2a8e99d87995e39a6e927358c91398f374474746070ef7" +dependencies = [ + "leb128fmt", + "wasmparser 0.246.2", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.1", + "wasm-encoder 0.244.0", + "wasmparser 0.244.0", ] [[package]] @@ -3732,13 +5395,36 @@ version = "0.243.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6d8db401b0528ec316dfbe579e6ab4152d61739cfe076706d2009127970159d" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "hashbrown 0.15.5", - "indexmap 2.12.1", + "indexmap 2.13.1", "semver", "serde", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.11.0", + "hashbrown 0.15.5", + "indexmap 2.13.1", + "semver", +] + +[[package]] +name = "wasmparser" +version = "0.246.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71cde4757396defafd25417cfb36aa3161027d06d865b0c24baaae229aac005d" +dependencies = [ + "bitflags 2.11.0", + "indexmap 2.13.1", + "semver", +] + [[package]] name = "wasmprinter" version = "0.243.0" @@ -3747,19 +5433,19 @@ checksum = "eb2b6035559e146114c29a909a3232928ee488d6507a1504d8934e8607b36d7b" dependencies = [ "anyhow", "termcolor", - "wasmparser", + "wasmparser 0.243.0", ] [[package]] name = "wasmtime" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a19f56cece843fa95dd929f5568ff8739c7e3873b530ceea9eda2aa02a0b4142" +checksum = "e2a83182bf04af87571b4c642300479501684f26bab5597f68f68cded5b098fd" dependencies = [ "addr2line", "anyhow", "async-trait", - "bitflags 2.10.0", + "bitflags 2.11.0", "bumpalo", "cc", "cfg-if", @@ -3768,7 +5454,7 @@ dependencies = [ "fxprof-processed-profile", "gimli", "hashbrown 0.15.5", - "indexmap 2.12.1", + "indexmap 2.13.1", "ittapi", "libc", "log", @@ -3779,7 +5465,7 @@ dependencies = [ "postcard", "pulley-interpreter", "rayon", - "rustix 1.1.3", + "rustix 1.1.4", "semver", "serde", "serde_derive", @@ -3788,8 +5474,8 @@ dependencies = [ "target-lexicon", "tempfile", "wasm-compose", - "wasm-encoder", - "wasmparser", + "wasm-encoder 0.243.0", + "wasmparser 0.243.0", "wasmtime-environ", "wasmtime-internal-cache", "wasmtime-internal-component-macro", @@ -3809,16 +5495,16 @@ dependencies = [ [[package]] name = "wasmtime-environ" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf9dff572c950258548cbbaf39033f68f8dcd0b43b22e80def9fe12d532d3e5" +checksum = "cb201c41aa23a3642365cfb2e4a183573d85127a3c9d528f56b9997c984541ab" dependencies = [ "anyhow", "cpp_demangle", "cranelift-bitset", "cranelift-entity", "gimli", - "indexmap 2.12.1", + "indexmap 2.13.1", "log", "object", "postcard", @@ -3828,23 +5514,23 @@ dependencies = [ "serde_derive", "smallvec", "target-lexicon", - "wasm-encoder", - "wasmparser", + "wasm-encoder 0.243.0", + "wasmparser 0.243.0", "wasmprinter", "wasmtime-internal-component-util", ] [[package]] name = "wasmtime-internal-cache" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f52a985f5b5dae53147fc596f3a313c334e2c24fd1ba708634e1382f6ecd727" +checksum = "fb5b3069d1a67ba5969d0eb1ccd7e141367d4e713f4649aa90356c98e8f19bea" dependencies = [ "base64", "directories-next", "log", "postcard", - "rustix 1.1.3", + "rustix 1.1.4", "serde", 
"serde_derive", "sha2", @@ -3856,9 +5542,9 @@ dependencies = [ [[package]] name = "wasmtime-internal-component-macro" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7920dc7dcb608352f5fe93c52582e65075b7643efc5dac3fc717c1645a8d29a0" +checksum = "0c924400db7b6ca996fef1b23beb0f41d5c809836b1ec60fc25b4057e2d25d9b" dependencies = [ "anyhow", "proc-macro2", @@ -3866,20 +5552,20 @@ dependencies = [ "syn", "wasmtime-internal-component-util", "wasmtime-internal-wit-bindgen", - "wit-parser", + "wit-parser 0.243.0", ] [[package]] name = "wasmtime-internal-component-util" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "066f5aed35aa60580a2ac0df145c0f0d4b04319862fee1d6036693e1cca43a12" +checksum = "7d3f65daf4bf3d74ca2fbbe20af0589c42e2b398a073486451425d94fd4afef4" [[package]] name = "wasmtime-internal-cranelift" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb8002dc415b7773d7949ee360c05ee8f91627ec25a7a0b01ee03831bdfdda1" +checksum = "633e889cdae76829738db0114ab3b02fce51ea4a1cd9675a67a65fce92e8b418" dependencies = [ "cfg-if", "cranelift-codegen", @@ -3894,8 +5580,8 @@ dependencies = [ "pulley-interpreter", "smallvec", "target-lexicon", - "thiserror 2.0.17", - "wasmparser", + "thiserror 2.0.18", + "wasmparser 0.243.0", "wasmtime-environ", "wasmtime-internal-math", "wasmtime-internal-unwinder", @@ -3904,14 +5590,14 @@ dependencies = [ [[package]] name = "wasmtime-internal-fiber" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9c562c5a272bc9f615d8f0c085a4360bafa28eef9aa5947e63d204b1129b22" +checksum = "deb126adc5d0c72695cfb77260b357f1b81705a0f8fa30b3944e7c2219c17341" dependencies = [ "cc", "cfg-if", "libc", - "rustix 1.1.3", + "rustix 1.1.4", "wasmtime-environ", 
"wasmtime-internal-versioned-export-macros", "windows-sys 0.61.2", @@ -3919,21 +5605,21 @@ dependencies = [ [[package]] name = "wasmtime-internal-jit-debug" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db673148f26e1211db3913c12c75594be9e3858a71fa297561e9162b1a49cfb0" +checksum = "8e66ff7f90a8002187691ff6237ffd09f954a0ebb9de8b2ff7f5c62632134120" dependencies = [ "cc", "object", - "rustix 1.1.3", + "rustix 1.1.4", "wasmtime-internal-versioned-export-macros", ] [[package]] name = "wasmtime-internal-jit-icache-coherence" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bada5ca1cc47df7d14100e2254e187c2486b426df813cea2dd2553a7469f7674" +checksum = "4b96df23179ae16d54fb3a420f84ffe4383ec9dd06fad3e5bc782f85f66e8e08" dependencies = [ "anyhow", "cfg-if", @@ -3943,24 +5629,24 @@ dependencies = [ [[package]] name = "wasmtime-internal-math" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf6f615d528eda9adc6eefb062135f831b5215c348f4c3ec3e143690c730605b" +checksum = "86d1380926682b44c383e9a67f47e7a95e60c6d3fa8c072294dab2c7de6168a0" dependencies = [ "libm", ] [[package]] name = "wasmtime-internal-slab" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da169d4f789b586e1b2612ba8399c653ed5763edf3e678884ba785bb151d018f" +checksum = "9b63cbea1c0192c7feb7c0dfb35f47166988a3742f29f46b585ef57246c65764" [[package]] name = "wasmtime-internal-unwinder" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4888301f3393e4e8c75c938cce427293fade300fee3fc8fd466fdf3e54ae068e" +checksum = "f25c392c7e5fb891a7416e3c34cfbd148849271e8c58744fda875dde4bec4d6a" dependencies = [ "cfg-if", "cranelift-codegen", @@ -3971,9 +5657,9 @@ dependencies = [ [[package]] name = 
"wasmtime-internal-versioned-export-macros" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63ba3124cc2cbcd362672f9f077303ccc4cd61daa908f73447b7fdaece75ff9f" +checksum = "70f8b9796a3f0451a7b702508b303d654de640271ac80287176de222f187a237" dependencies = [ "proc-macro2", "quote", @@ -3982,16 +5668,16 @@ dependencies = [ [[package]] name = "wasmtime-internal-winch" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90a4182515dabba776656de4ebd62efad03399e261cf937ecccb838ce8823534" +checksum = "c0063e61f1d0b2c20e9cfc58361a6513d074a23c80b417aac3033724f51648a0" dependencies = [ "cranelift-codegen", "gimli", "log", "object", "target-lexicon", - "wasmparser", + "wasmparser 0.243.0", "wasmtime-environ", "wasmtime-internal-cranelift", "winch-codegen", @@ -3999,26 +5685,26 @@ dependencies = [ [[package]] name = "wasmtime-internal-wit-bindgen" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87acbd416227cdd279565ba49e57cf7f08d112657c3b3f39b70250acdfd094fe" +checksum = "587699ca7cae16b4a234ffcc834f37e75675933d533809919b52975f5609e2ef" dependencies = [ "anyhow", - "bitflags 2.10.0", + "bitflags 2.11.0", "heck", - "indexmap 2.12.1", - "wit-parser", + "indexmap 2.13.1", + "wit-parser 0.243.0", ] [[package]] name = "wasmtime-wasi" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a1bdb4948463ed22559a640e687fed0df50b66353144aa6a9496c041ecd927" +checksum = "fc2eb9dc95baed3cd86fdfebf9f9f333337eb308bf8bd973e0c7b06d9418c35f" dependencies = [ "anyhow", "async-trait", - "bitflags 2.10.0", + "bitflags 2.11.0", "bytes", "cap-fs-ext", "cap-net-ext", @@ -4029,9 +5715,9 @@ dependencies = [ "futures", "io-extras", "io-lifetimes", - "rustix 1.1.3", + "rustix 1.1.4", "system-interface", - "thiserror 2.0.17", + "thiserror 
2.0.18", "tokio", "tracing", "url", @@ -4043,9 +5729,9 @@ dependencies = [ [[package]] name = "wasmtime-wasi-io" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7873d8b990d3cf1105ef491abf2b3cf1e19ff6722d24d5ca662026ea082cdff" +checksum = "a0b8402f1e04385071fdd96aca97cba995d7376b572e42ce5841d5b6aaf6fa30" dependencies = [ "anyhow", "async-trait", @@ -4065,35 +5751,55 @@ dependencies = [ [[package]] name = "wast" -version = "243.0.0" +version = "246.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df21d01c2d91e46cb7a221d79e58a2d210ea02020d57c092e79255cc2999ca7f" +checksum = "fe3fe8e3bf88ad96d031b4181ddbd64634b17cb0d06dfc3de589ef43591a9a62" dependencies = [ "bumpalo", "leb128fmt", "memchr", "unicode-width 0.2.2", - "wasm-encoder", + "wasm-encoder 0.246.2", ] [[package]] name = "wat" -version = "1.243.0" +version = "1.246.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd7fda1199b94fff395c2d19a153f05dbe7807630316fa9673367666fd2ad8c" +dependencies = [ + "wast 246.0.2", +] + +[[package]] +name = "web-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "226a9a91cd80a50449312fef0c75c23478fcecfcc4092bdebe1dc8e760ef521b" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ - "wast 243.0.0", + "js-sys", + "wasm-bindgen", ] [[package]] name = "wiggle" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f05d2a9932ca235984248dc98471ae83d1985e095682d049af4c296f54f0fb4" +checksum = 
"a69a60bcbe1475c5dc9ec89210ade54823d44f742e283cba64f98f89697c4cec" dependencies = [ "anyhow", - "bitflags 2.10.0", - "thiserror 2.0.17", + "bitflags 2.11.0", + "thiserror 2.0.18", "tracing", "wasmtime", "wiggle-macro", @@ -4101,9 +5807,9 @@ dependencies = [ [[package]] name = "wiggle-generate" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57f773d51c1696bd7d028aa35c884d9fc58f48d79a1176dfbad6c908de314235" +checksum = "21f3dc0fd4dcfc7736434bb216179a2147835309abc09bf226736a40d484548f" dependencies = [ "anyhow", "heck", @@ -4115,9 +5821,9 @@ dependencies = [ [[package]] name = "wiggle-macro" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e976fe0cecd60041f66b15ad45ebc997952af13da9bf9d90261c7b025057edc" +checksum = "fea2aea744eded58ae092bf57110c27517dab7d5a300513ff13897325c5c5021" dependencies = [ "proc-macro2", "quote", @@ -4158,9 +5864,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "winch-codegen" -version = "41.0.3" +version = "41.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4f31dcfdfaf9d6df9e1124d7c8ee6fc29af5b99b89d11ae731c138e0f5bd77b" +checksum = "c55de3ac5b8bd71e5f6c87a9e511dd3ceb194bdb58183c6a7bf21cd8c0e46fbc" dependencies = [ "anyhow", "cranelift-assembler-x64", @@ -4169,8 +5875,8 @@ dependencies = [ "regalloc2", "smallvec", "target-lexicon", - "thiserror 2.0.17", - "wasmparser", + "thiserror 2.0.18", + "wasmparser 0.243.0", "wasmtime-environ", "wasmtime-internal-cranelift", "wasmtime-internal-math", @@ -4402,9 +6108,15 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.14" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" + +[[package]] +name 
= "winnow" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" dependencies = [ "memchr", ] @@ -4415,15 +6127,79 @@ version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f3fd376f71958b862e7afb20cfe5a22830e1963462f3a17f49d82a6c1d1f42d" dependencies = [ - "bitflags 2.10.0", + "bitflags 2.11.0", "windows-sys 0.59.0", ] [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser 0.244.0", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.1", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" 
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.11.0", + "indexmap 2.13.1", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder 0.244.0", + "wasm-metadata", + "wasmparser 0.244.0", + "wit-parser 0.244.0", +] [[package]] name = "wit-parser" @@ -4433,14 +6209,32 @@ checksum = "df983a8608e513d8997f435bb74207bf0933d0e49ca97aa9d8a6157164b9b7fc" dependencies = [ "anyhow", "id-arena", - "indexmap 2.12.1", + "indexmap 2.13.1", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser 0.243.0", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.1", "log", "semver", "serde", "serde_derive", "serde_json", "unicode-xid", - "wasmparser", + "wasmparser 0.244.0", ] [[package]] @@ -4457,15 +6251,30 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "xz2" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum 
= "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -4474,9 +6283,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", @@ -4486,18 +6295,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.31" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.31" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", @@ -4506,18 +6315,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", @@ -4527,9 +6336,9 @@ dependencies = [ [[package]] name = "zerotrie" -version = 
"0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -4538,9 +6347,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -4549,20 +6358,26 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + [[package]] name = "zmij" -version = "1.0.10" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e0d8dffbae3d840f64bda38e28391faef673a7b5a6017840f2a106c8145868" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 4b855aa9..87d4ea03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,14 +20,14 @@ path = "src/main.rs" [dependencies] -tokio = { version = "1.0", features = ["full"] } +tokio = { version = "1.0", features = ["macros", "rt-multi-thread", "sync", "time", "net", "signal"] } serde = { version = "1.0", features = ["derive"] } serde_yaml = "0.9" serde_json = "1.0" uuid = { version = "1.0", features = 
["v4"] } log = "0.4" tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] } +tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-appender = "0.2" anyhow = "1.0" thiserror = "2" @@ -35,23 +35,44 @@ tonic = { version = "0.12", features = ["default"] } async-trait = "0.1" num_cpus = "1.0" protocol = { path = "./protocol" } -rdkafka = { version = "0.38", features = ["cmake-build", "ssl", "gssapi"] } +prost = "0.13" +rdkafka = { version = "0.38", features = ["cmake-build", "ssl", "gssapi", "curl"] } crossbeam-channel = "0.5" -pest = "2.7" -pest_derive = "2.7" -clap = { version = "4.5", features = ["derive"] } wasmtime = { version = "41.0.3", features = ["component-model", "async"] } base64 = "0.22" wasmtime-wasi = "41.0.3" rocksdb = { version = "0.21", features = ["multi-threaded-cf", "lz4"] } -bincode = "1.3" +bincode = { version = "2", features = ["serde"] } +chrono = "0.4" tokio-stream = "0.1.18" lru = "0.12" parking_lot = "0.12" -arrow-array = "52" -arrow-ipc = "52" -arrow-schema = "52" +arrow = { version = "55", default-features = false } +arrow-array = "55" +arrow-ipc = "55" +arrow-schema = { version = "55", features = ["serde"] } +futures = "0.3" +serde_json_path = "0.7" +xxhash-rust = { version = "0.8", features = ["xxh3"] } proctitle = "0.1" +unicase = "2.7" +petgraph = "0.7" +rand = { version = "0.8", features = ["small_rng"] } +itertools = "0.14" +strum = { version = "0.26", features = ["derive"] } + +arrow-json = {version = '55.2.0'} +datafusion = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} +datafusion-common = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} +datafusion-execution = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} +datafusion-expr = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} +datafusion-physical-expr = {git = 
'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} +datafusion-proto = {git = 'https://github.com/FunctionStream/datafusion', branch = '48.0.1/fs'} + +sqlparser = { git = "https://github.com/FunctionStream/sqlparser-rs", branch = "0.58.0/fs" } + +ahash = "0.8" +governor = "0.8.0" [features] default = ["incremental-cache", "python"] diff --git a/Dockerfile b/Dockerfile index a580c5d8..7c8ac205 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,6 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ clang \ libclang-dev \ libssl-dev \ + libcurl4-openssl-dev \ pkg-config \ libsasl2-dev \ protobuf-compiler \ diff --git a/README-zh.md b/README-zh.md index b1d68eac..05fd5fc2 100644 --- a/README-zh.md +++ b/README-zh.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | [English](README.md) -**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。 +**Function Stream** 是一个基于 Rust 构建的高性能、事件驱动的流处理框架。它提供了一个模块化的运行时,用于编排编译为 **WebAssembly (WASM)** 的 Serverless 风格处理函数,支持使用 **Go、Python 和 Rust** 编写函数。同时内置 **Streaming SQL** 引擎,可通过纯声明式 SQL 构建实时数据管道 — 包括时间窗口聚合、多流关联和持续 ETL。 ## 目录 @@ -46,6 +46,7 @@ ## 核心特性 +- **Streaming SQL 引擎**:使用纯 SQL 构建实时管道 — 注册数据源(`CREATE TABLE`)、启动持续计算(`CREATE STREAMING TABLE ... 
AS SELECT`)、管理生命周期(`SHOW` / `DROP`)。支持滚动窗口、滑动窗口、窗口关联等丰富语义。 - **事件驱动的 WASM 运行时**:以接近原生的性能和沙箱隔离的方式执行多语言函数(Go、Python、Rust)。 - **持久化状态管理**:内置支持基于 RocksDB 的状态存储,用于有状态流处理。 - **SQL 驱动的 CLI**:使用类 SQL 命令进行作业管理和流检测的交互式 REPL。 @@ -71,7 +72,7 @@ function-stream/ - **Rust 工具链**:Stable >= 1.77 (通过 rustup 安装)。 - **Python 3.9+**:构建 Python WASM 运行时所需。 - **Protoc**:Protocol Buffers 编译器(用于生成 gRPC 绑定)。 -- **构建工具**:cmake, pkg-config, OpenSSL headers (用于 rdkafka)。 +- **构建工具**:cmake、pkg-config、OpenSSL 头文件,以及 **libcurl** 开发头文件(Debian/Ubuntu 上为 `libcurl4-openssl-dev`)——在使用带 SSL/OAuth 相关选项构建 **rdkafka** 时,捆绑的 librdkafka 需要 `curl/curl.h`。 ## 快速开始 (本地开发) @@ -200,14 +201,16 @@ function-stream-/ ## 文档 -| 文档 | 描述 | -|------------------------------------------------------|---------------| -| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | -| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | -| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | -| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | -| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | -| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | +| 文档 | 描述 | +|------------------------------------------------------------------------|--------------------------| +| [Streaming SQL 使用指南](docs/streaming-sql-guide-zh.md) | 声明式 SQL 实时流处理指南 | +| [连接器、格式与类型参考](docs/connectors-and-formats-zh.md) | 支持的 Source/Sink、格式与数据类型 | +| [服务端配置与运维指南](docs/server-configuration-zh.md) | 服务端配置与运维操作 | +| [Function 任务配置规范](docs/function-configuration-zh.md) | 任务定义规范 | +| [SQL CLI 交互式管理指南](docs/sql-cli-guide-zh.md) | 交互式管理指南 | +| [Function 管理与开发指南](docs/function-development-zh.md) | 管理与开发指南 | +| [Go SDK 开发与交互指南](docs/Go-SDK/go-sdk-guide-zh.md) | Go SDK 指南 | +| [Python SDK 开发与交互指南](docs/Python-SDK/python-sdk-guide-zh.md) | Python SDK 指南 | ## 配置 diff --git a/README.md b/README.md index 51a69de1..34e1400b 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ [中文](README-zh.md) | 
[English](README.md) -**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. +**Function Stream** is a high-performance, event-driven stream processing framework built in Rust. It provides a modular runtime to orchestrate serverless-style processing functions compiled to **WebAssembly (WASM)**, supporting functions written in **Go, Python, and Rust**. It also features a **Streaming SQL** engine that lets you build real-time data pipelines — including time-windowed aggregations, multi-stream joins, and continuous ETL — using pure declarative SQL. ## Table of Contents @@ -46,6 +46,7 @@ ## Key Features +* **Streaming SQL Engine**: Build real-time pipelines with pure SQL — register sources (`CREATE TABLE`), launch continuous computations (`CREATE STREAMING TABLE ... AS SELECT`), and manage lifecycle (`SHOW` / `DROP`). Supports tumbling windows, hopping windows, window joins, and more. * **Event-Driven WASM Runtime**: Executes polyglot functions (Go, Python, Rust) with near-native performance and sandboxed isolation. * **Durable State Management**: Built-in support for RocksDB-backed state stores for stateful stream processing. * **SQL-Powered CLI**: Interactive REPL for job management and stream inspection using SQL-like commands. @@ -72,7 +73,7 @@ function-stream/ * **Rust Toolchain**: Stable >= 1.77 (via rustup). * **Python 3.9+**: Required for building the Python WASM runtime. * **Protoc**: Protocol Buffers compiler (for generating gRPC bindings). -* **Build Tools**: cmake, pkg-config, OpenSSL headers (for rdkafka). 
+* **Build Tools**: cmake, pkg-config, OpenSSL headers, **libcurl** development headers (`libcurl4-openssl-dev` on Debian/Ubuntu) — required when building **rdkafka** with SSL/OAuth-related options (bundled librdkafka expects `curl/curl.h`). ## Quick Start (Local Development) @@ -199,14 +200,16 @@ We provide a robust shell script to manage the server process, capable of handli ## Documentation -| Document | Description | -|----------------------------------------------------------|-----------------------------------| -| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | -| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | -| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | -| [Function Development](docs/function-development.md) | Management & Development Guide | -| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | -| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | +| Document | Description | +|----------------------------------------------------------------|-------------------------------------------------| +| [Streaming SQL Guide](docs/streaming-sql-guide.md) | Declarative SQL for Real-Time Stream Processing | +| [Connectors, Formats & Types](docs/connectors-and-formats.md) | Supported Sources, Sinks, Formats & Data Types | +| [Server Configuration](docs/server-configuration.md) | Server Configuration & Operations | +| [Function Configuration](docs/function-configuration.md) | Task Definition Specification | +| [SQL CLI Guide](docs/sql-cli-guide.md) | Interactive Management Guide | +| [Function Development](docs/function-development.md) | Management & Development Guide | +| [Go SDK Guide](docs/Go-SDK/go-sdk-guide.md) | Go SDK Guide | +| [Python SDK Guide](docs/Python-SDK/python-sdk-guide.md) | Python SDK Guide | ## Configuration diff --git a/cli/cli/Cargo.toml b/cli/cli/Cargo.toml index 72352995..49c0a881 100644 --- 
a/cli/cli/Cargo.toml +++ b/cli/cli/Cargo.toml @@ -12,11 +12,9 @@ arrow-array = "52" arrow-ipc = "52" arrow-schema = "52" comfy-table = "7" -function-stream = { path = "../../" } protocol = { path = "../../protocol" } clap = { version = "4.5", features = ["derive"] } -thiserror = "2" -tokio = { version = "1.0", features = ["full", "signal"] } +tokio = { version = "1.0", features = ["macros", "rt-multi-thread", "sync", "signal"] } tonic = { version = "0.12", features = ["default"] } rustyline = { version = "14.0", features = ["with-dirs"] } diff --git a/cli/cli/src/repl.rs b/cli/cli/src/repl.rs index 7f8087b3..b442bd07 100644 --- a/cli/cli/src/repl.rs +++ b/cli/cli/src/repl.rs @@ -20,26 +20,59 @@ use comfy_table::{Attribute, Cell, Color, ContentArrangement, Table, TableCompon use protocol::cli::{function_stream_service_client::FunctionStreamServiceClient, SqlRequest}; use rustyline::error::ReadlineError; use rustyline::{Config, DefaultEditor, EditMode}; +use std::fmt; use std::io::{self, Cursor, Write}; use std::sync::Arc; use tokio::sync::Mutex; use tonic::Request; -#[derive(Debug, thiserror::Error)] +/// CLI errors. +/// +/// **Important:** [`tonic::Status`] must not be formatted with `{}` — its [`fmt::Display`] dumps +/// `details` / `metadata` (e.g. HTTP headers). Only [`tonic::Status::message`] is stored in +/// [`ReplError::Rpc`]. 
+#[derive(Debug)] pub enum ReplError { - #[error("RPC error: {0}")] - Rpc(Box), - #[error("Connection failed: {0}")] + Rpc(String), Connection(String), - #[error("Internal error: {0}")] Internal(String), - #[error("IO error: {0}")] - Io(#[from] io::Error), + Io(io::Error), +} + +impl fmt::Display for ReplError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ReplError::Rpc(s) => f.write_str(s), + ReplError::Connection(s) => f.write_str(s), + ReplError::Internal(s) => write!(f, "Internal error: {s}"), + ReplError::Io(e) => write!(f, "IO error: {e}"), + } + } +} + +impl std::error::Error for ReplError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + ReplError::Io(e) => Some(e), + _ => None, + } + } +} + +impl From for ReplError { + fn from(e: io::Error) -> Self { + ReplError::Io(e) + } } impl From for ReplError { fn from(s: tonic::Status) -> Self { - ReplError::Rpc(Box::new(s)) + let msg = s.message(); + if msg.is_empty() { + ReplError::Rpc(format!("gRPC {} (server returned no message)", s.code())) + } else { + ReplError::Rpc(msg.to_string()) + } } } diff --git a/conf/config.yaml b/conf/config.yaml index 3f19493d..9d0f625e 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -117,3 +117,10 @@ task_storage: # Maximum bytes for level base in bytes (optional) max_bytes_for_level_base: 268435456 + +# Stream table catalog (SQL: CREATE TABLE connector sources, SHOW TABLES, SHOW CREATE TABLE). +# When persist is true (default), metadata is stored under RocksDB at db_path (default: data/stream_catalog) +# and reloaded after process restart. Set persist: false only for tests/ephemeral nodes. 
+stream_catalog: + persist: true + # db_path: data/stream_catalog diff --git a/docs/connectors-and-formats-zh.md b/docs/connectors-and-formats-zh.md new file mode 100644 index 00000000..8f25a7dc --- /dev/null +++ b/docs/connectors-and-formats-zh.md @@ -0,0 +1,197 @@ + + +# 连接器、数据格式与 SQL 类型参考 + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +本文档是 Function Stream Streaming SQL 引擎所支持的连接器(Source / Sink)、序列化格式以及 SQL 数据类型的权威参考。 + +--- + +## 目录 + +- [1. 连接器 (Connector)](#1-连接器-connector) + - [1.1 Kafka Source(数据源)](#11-kafka-source数据源) + - [1.2 Kafka Sink(数据汇)](#12-kafka-sink数据汇) +- [2. 数据格式 (Format)](#2-数据格式-format) +- [3. SQL 数据类型](#3-sql-数据类型) +- [4. 完整示例](#4-完整示例) + +--- + +## 1. 连接器 (Connector) + +当前 Function Stream 支持 **Kafka** 作为生产可用的连接器,同时可作为数据源(Source)和数据汇(Sink)。 + +### 1.1 Kafka Source(数据源) + +Kafka Source 从一个或多个 Kafka Topic 分区读取消息。在 `CREATE TABLE` 中使用以注册输入流。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要消费的 Kafka Topic。 | `'raw_events'` | +| `format` | 消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表,逗号分隔。 | `'broker1:9092,broker2:9092'` | + +**示例:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka Sink(数据汇) + +Kafka Sink 将计算结果写入 Kafka Topic。在 `CREATE STREAMING TABLE` 的 `WITH` 子句中配置。 + +**必填属性:** + +| 属性 | 说明 | 示例 | +|------|------|------| +| `connector` | 必须为 `kafka`。 | `'kafka'` | +| `topic` | 要写入的 Kafka Topic。 | `'sink_results'` | +| `format` | 输出消息的序列化格式。 | `'json'` | +| `bootstrap.servers` | Kafka Broker 地址列表。 | `'broker1:9092'` | + +**示例:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 
'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. 数据格式 (Format) + +当前唯一支持的序列化格式是 **JSON**。每条 Kafka 消息应为一个自描述的 JSON 对象,其字段直接映射到 `CREATE TABLE` 中定义的列。 + +在 `WITH` 子句中设置 `'format' = 'json'`(省略时也默认为 JSON)。 + +--- + +## 3. SQL 数据类型 + +以下是 `CREATE TABLE` 列定义中支持的 SQL 数据类型: + +### 数值类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `BOOLEAN` | `BOOL` | Boolean | 布尔值。 | +| `TINYINT` | — | Int8 | 8 位有符号整数。 | +| `SMALLINT` | `INT2` | Int16 | 16 位有符号整数。 | +| `INT` | `INTEGER`、`INT4` | Int32 | 32 位有符号整数。 | +| `BIGINT` | `INT8` | Int64 | 64 位有符号整数。 | +| `TINYINT UNSIGNED` | — | UInt8 | 8 位无符号整数。 | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16 位无符号整数。 | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32 位无符号整数。 | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64 位无符号整数。 | +| `FLOAT` | `REAL`、`FLOAT4` | Float32 | 32 位 IEEE 754 浮点数。 | +| `DOUBLE` | `DOUBLE PRECISION`、`FLOAT8` | Float64 | 64 位 IEEE 754 浮点数。 | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | 定点小数。精度 1–38,标度 <= 精度。 | + +### 字符串与二进制类型 + +| SQL 类型 | 别名 | Arrow 类型 | 说明 | +|----------|------|-----------|------| +| `VARCHAR` | `TEXT`、`STRING`、`CHAR` | Utf8 | 可变长度 UTF-8 字符串。 | +| `BYTEA` | — | Binary | 可变长度字节数组。 | +| `JSON` | — | Utf8(JSON 扩展) | 带有 FunctionStream 扩展元数据的 JSON 类型字符串。 | + +### 日期与时间类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| `TIMESTAMP` | Timestamp(Nanosecond) | 不含时区的日期时间(纳秒精度)。 | +| `TIMESTAMP(0)` | Timestamp(Second) | 秒精度。 | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | 毫秒精度。 | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | 微秒精度。 | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | 纳秒精度(与 `TIMESTAMP` 相同)。 | +| `DATE` | Date32 | 日历日期(年、月、日)。 | +| `DATETIME` | Timestamp(Nanosecond) | `TIMESTAMP` 的别名。 | +| `TIME` | Time64(Nanosecond) | 不含时区的时刻。 | +| `INTERVAL` | Interval(MonthDayNano) | 时间间隔 / 持续时间。 | + +### 复合类型 + +| SQL 类型 | Arrow 类型 | 说明 | +|----------|-----------|------| +| 
`STRUCT` | Struct | 命名组合字段。 | +| `ARRAY` | List | 相同类型元素的有序列表。也支持 `element_type[]` 语法。 | + +--- + +## 4. 完整示例 + +以下是一个结合 Kafka Source、Kafka Sink、JSON 格式和多种 SQL 数据类型的完整示例: + +```sql +-- Source:从 Kafka 读取用户活动事件 +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink:1 分钟滚动窗口聚合 +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/connectors-and-formats.md b/docs/connectors-and-formats.md new file mode 100644 index 00000000..46d0d964 --- /dev/null +++ b/docs/connectors-and-formats.md @@ -0,0 +1,197 @@ + + +# Connectors, Formats & Data Types + +[中文](connectors-and-formats-zh.md) | [English](connectors-and-formats.md) + +This document is the authoritative reference for connectors (sources & sinks), serialization formats, and SQL data types supported by Function Stream's Streaming SQL engine. + +--- + +## Table of Contents + +- [1. Connectors](#1-connectors) + - [1.1 Kafka (Source)](#11-kafka-source) + - [1.2 Kafka (Sink)](#12-kafka-sink) +- [2. Data Format](#2-data-format) +- [3. SQL Data Types](#3-sql-data-types) +- [4. Full Example](#4-full-example) + +--- + +## 1. Connectors + +Currently Function Stream supports **Kafka** as the production-ready connector for both source (ingestion) and sink (egress). + +### 1.1 Kafka (Source) + +A Kafka source reads records from one or more Kafka topic partitions. Use it in `CREATE TABLE` to register an input stream. 
+ +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to consume from. | `'raw_events'` | +| `format` | Serialization format of messages. | `'json'` | +| `bootstrap.servers` | Comma-separated list of Kafka broker addresses. | `'broker1:9092,broker2:9092'` | + +**Example:** + +```sql +CREATE TABLE page_views ( + user_id VARCHAR, + page_url VARCHAR, + view_time TIMESTAMP NOT NULL, + WATERMARK FOR view_time AS view_time - INTERVAL '3' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'page_views', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 1.2 Kafka (Sink) + +A Kafka sink writes records into a Kafka topic. It is configured in the `WITH` clause of a `CREATE STREAMING TABLE` statement. + +**Required Properties:** + +| Property | Description | Example | +|----------|-------------|---------| +| `connector` | Must be `kafka`. | `'kafka'` | +| `topic` | Kafka topic to write to. | `'sink_results'` | +| `format` | Serialization format of output messages. | `'json'` | +| `bootstrap.servers` | Comma-separated Kafka broker addresses. | `'broker1:9092'` | + +**Example:** + +```sql +CREATE STREAMING TABLE enriched_clicks WITH ( + 'connector' = 'kafka', + 'topic' = 'enriched_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT click_id, user_id, click_time +FROM ad_clicks; +``` + +--- + +## 2. Data Format + +Currently the only supported serialization format is **JSON**. Each Kafka message is expected to be a self-describing JSON object whose fields map directly to the columns defined in `CREATE TABLE`. + +Set `'format' = 'json'` in the `WITH` clause (this is also the default when omitted). + +--- + +## 3. 
SQL Data Types + +The following SQL data types are supported in `CREATE TABLE` column definitions: + +### Numeric Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `BOOLEAN` | `BOOL` | Boolean | True / false. | +| `TINYINT` | — | Int8 | 8-bit signed integer. | +| `SMALLINT` | `INT2` | Int16 | 16-bit signed integer. | +| `INT` | `INTEGER`, `INT4` | Int32 | 32-bit signed integer. | +| `BIGINT` | `INT8` | Int64 | 64-bit signed integer. | +| `TINYINT UNSIGNED` | — | UInt8 | 8-bit unsigned integer. | +| `SMALLINT UNSIGNED` | `INT2 UNSIGNED` | UInt16 | 16-bit unsigned integer. | +| `INT UNSIGNED` | `INT4 UNSIGNED` | UInt32 | 32-bit unsigned integer. | +| `BIGINT UNSIGNED` | `INT8 UNSIGNED` | UInt64 | 64-bit unsigned integer. | +| `FLOAT` | `REAL`, `FLOAT4` | Float32 | 32-bit IEEE 754 floating point. | +| `DOUBLE` | `DOUBLE PRECISION`, `FLOAT8` | Float64 | 64-bit IEEE 754 floating point. | +| `DECIMAL(p, s)` | `NUMERIC(p, s)` | Decimal128 | Fixed-point decimal. Precision 1–38, scale <= precision. | + +### String & Binary Types + +| SQL Type | Aliases | Arrow Type | Description | +|----------|---------|------------|-------------| +| `VARCHAR` | `TEXT`, `STRING`, `CHAR` | Utf8 | Variable-length UTF-8 string. | +| `BYTEA` | — | Binary | Variable-length byte array. | +| `JSON` | — | Utf8 (JSON extension) | JSON-typed string with FunctionStream extension metadata. | + +### Date & Time Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `TIMESTAMP` | Timestamp(Nanosecond) | Date and time without timezone (nanosecond precision). | +| `TIMESTAMP(0)` | Timestamp(Second) | Second precision. | +| `TIMESTAMP(3)` | Timestamp(Millisecond) | Millisecond precision. | +| `TIMESTAMP(6)` | Timestamp(Microsecond) | Microsecond precision. | +| `TIMESTAMP(9)` | Timestamp(Nanosecond) | Nanosecond precision (same as `TIMESTAMP`). | +| `DATE` | Date32 | Calendar date (year, month, day). 
| +| `DATETIME` | Timestamp(Nanosecond) | Alias for `TIMESTAMP`. | +| `TIME` | Time64(Nanosecond) | Time of day without timezone. | +| `INTERVAL` | Interval(MonthDayNano) | Time duration / interval. | + +### Composite Types + +| SQL Type | Arrow Type | Description | +|----------|------------|-------------| +| `STRUCT` | Struct | Named composite fields. | +| `ARRAY` | List | Ordered list of elements of the same type. Also supports `element_type[]` syntax. | + +--- + +## 4. Full Example + +Below is a complete example combining a Kafka source, a Kafka sink, JSON format, and various SQL data types: + +```sql +-- Source: user activity events from Kafka +CREATE TABLE user_activity ( + event_id VARCHAR, + user_id BIGINT, + action VARCHAR, + amount DECIMAL(10, 2), + tags ARRAY, + event_time TIMESTAMP NOT NULL, + WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_activity', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- Sink: 1-minute tumbling window aggregation +CREATE STREAMING TABLE activity_stats_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'activity_stats_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + action, + COUNT(*) AS event_count, + SUM(amount) AS total_amount +FROM user_activity +GROUP BY 1, action; +``` diff --git a/docs/sql-cli-guide-zh.md b/docs/sql-cli-guide-zh.md index 8352dea1..bff05932 100644 --- a/docs/sql-cli-guide-zh.md +++ b/docs/sql-cli-guide-zh.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 三、REPL 内建辅助指令 +## 三、Streaming SQL:TABLE 与 STREAMING TABLE + +除了 Function 管理之外,CLI 还支持一整套 **Streaming SQL** 命令,用于声明数据源和构建实时管道。完整示例请参阅 [Streaming SQL 使用指南](streaming-sql-guide-zh.md)。 + +### 3.1 注册数据源:CREATE TABLE + +声明外部数据源(如 Kafka),包含 Schema、事件时间和水位线策略。此操作仅创建**静态目录条目**,不消耗计算资源。 + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id 
BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 创建流计算管道:CREATE STREAMING TABLE + +使用 CTAS 语法启动持续运行的分布式计算管道。结果以纯追加模式写入目标连接器。 + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 查看与监控 + +| 命令 | 说明 | +|------|------| +| `SHOW TABLES` | 列出所有已注册的数据源表。 | +| `SHOW CREATE TABLE ` | 显示某张表的建表 DDL。 | +| `SHOW STREAMING TABLES` | 列出所有正在运行的流计算管道及其状态。 | +| `SHOW CREATE STREAMING TABLE ` | 查看某条管道的物理执行拓扑图(ASCII 格式)。 | + +### 3.4 销毁流计算管道:DROP STREAMING TABLE + +停止并释放某条流计算管道的所有资源: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 四、REPL 内建辅助指令 在 `function-stream>` 提示符下,支持以下便捷指令: @@ -141,7 +203,7 @@ DROP FUNCTION go_processor_demo; --- -## 四、技术约束与注意事项 +## 五、技术约束与注意事项 - **路径隔离**:SQL CLI 本身不负责上传文件。function_path 指向的文件必须预先存在于**服务端机器**的磁盘上。若需远程上传打包,请使用 Python SDK。 - **Python 函数限制**:由于 Python 函数涉及动态依赖分析与代码打包,目前**不支持**通过 SQL CLI 创建,仅能通过 CLI 进行 START / STOP / SHOW 等生命周期管理。 diff --git a/docs/sql-cli-guide.md b/docs/sql-cli-guide.md index be42a37e..a7f36a88 100644 --- a/docs/sql-cli-guide.md +++ b/docs/sql-cli-guide.md @@ -129,7 +129,69 @@ DROP FUNCTION go_processor_demo; --- -## 3. REPL Built-in Auxiliary Commands +## 3. Streaming SQL: TABLE & STREAMING TABLE + +In addition to Function management, the CLI supports a full set of **Streaming SQL** commands for declaring data sources and building real-time pipelines. For a comprehensive guide with examples, see [Streaming SQL Guide](streaming-sql-guide.md). 
+ +### 3.1 Register Data Source: CREATE TABLE + +Declare an external data source (e.g. Kafka) with schema, event time, and watermark strategy. This creates a **static catalog entry** that consumes no compute resources. + +```sql +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +### 3.2 Create Streaming Pipeline: CREATE STREAMING TABLE + +Launch a continuous, distributed compute pipeline using CTAS syntax. Results are written to the target connector in append-only mode. + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY 1, campaign_id; +``` + +### 3.3 Inspect & Monitor + +| Command | Description | +|---------|-------------| +| `SHOW TABLES` | List all registered source tables. | +| `SHOW CREATE TABLE ` | Display the DDL of a registered table. | +| `SHOW STREAMING TABLES` | List all running streaming pipelines with status. | +| `SHOW CREATE STREAMING TABLE ` | Inspect the physical execution graph (ASCII topology). | + +### 3.4 Destroy Streaming Pipeline: DROP STREAMING TABLE + +Stop and release all resources for a streaming pipeline: + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +--- + +## 4. REPL Built-in Auxiliary Commands At the `function-stream>` prompt, the following convenient commands are supported: @@ -141,7 +203,7 @@ At the `function-stream>` prompt, the following convenient commands are supporte --- -## 4. Technical Constraints and Notes +## 5. 
Technical Constraints and Notes - **Path Isolation**: The SQL CLI itself is not responsible for uploading files. The file pointed to by function_path must pre-exist on the **Server machine's** disk. If remote upload packaging is required, please use the Python SDK. - **Python Function Limitations**: Since Python functions involve dynamic dependency analysis and code packaging, they are currently **not supported** for creation via SQL CLI; only lifecycle management such as START / STOP / SHOW via CLI is supported. diff --git a/docs/streaming-sql-guide-zh.md b/docs/streaming-sql-guide-zh.md new file mode 100644 index 00000000..5721971c --- /dev/null +++ b/docs/streaming-sql-guide-zh.md @@ -0,0 +1,408 @@ + + +# Function Stream:Streaming SQL 使用指南 + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream 提供了声明式 SQL 接口来构建实时流处理管道。通过 Streaming SQL,您可以轻松应对无界数据流(Unbounded Data)的摄取、时间窗口聚合、流式关联以及任务生命周期管理 —— **全程无需编写任何命令式代码**。 + +### 数据流与管道鸟瞰 + +```mermaid +flowchart LR + subgraph In["数据源(TABLE)"] + S["Kafka 等外部系统"] + end + subgraph Pipe["持续查询(STREAMING TABLE)"] + P["SQL 逻辑计划"] + W["窗口 / Join / 聚合"] + end + subgraph Out["结果写出"] + K["Sink 连接器"] + end + S --> P --> W --> K +``` + +> **阅读提示**:下图各节与「目录导览」中的锚点一一对应;遇到规划报错时,以 SQL 引擎返回信息为准。 + +--- + +## 目录导览 + +| 章节 | 说明 | +|------|------| +| [1. SQL 语法兼容性](#1-sql-语法兼容性) | DataFusion + Function Stream 扩展 | +| [2. 核心概念](#2-核心概念) | TABLE / STREAMING TABLE / 事件时间 / 水位线 | +| [3. 查询语法结构](#3-查询语法结构) | `WITH` → `SELECT` → `FROM` → `JOIN` → … | +| [4. JOIN 语法与支持](#4-join-语法与支持) | 写法、语义矩阵、`ON` 深度说明 | +| [实战一:注册数据源 (TABLE)](#实战一注册数据源-table) | `CREATE TABLE` + `WATERMARK` + `WITH` | +| [实战二:构建实时 Pipeline](#实战二构建实时-pipeline-streaming-table) | 四个完整场景 SQL | +| [实战三:生命周期与流任务管理](#实战三生命周期与流任务管理) | `SHOW` / `DROP STREAMING TABLE` | +| [SQL 语法速查表](#sql-语法速查表) | 常用语句一页表 | + +--- + + + +## 1. 
SQL 语法兼容性 + +> **核心概括**:Function Stream 的语法体系可以理解为 **「DataFusion SQL + Function Stream 流式 DDL 扩展」**。 + +| 层次 | 技术要点 | +|------|----------| +| **解析与规划** | SQL 经 **sqlparser** 解析,使用 Function Stream 定制的 **`FunctionStreamDialect`**,再由 **Apache DataFusion** 的 SQL 前端(`SqlToRel`)与逻辑规划器生成执行计划。 | +| **查询(`SELECT …`)语法** | 遵循 **DataFusion SQL**,整体接近 **ANSI SQL**,风格上多偏 **PostgreSQL**(标识符、常用函数、`JOIN` / `WHERE` / `GROUP BY` 等)。 | +| **兼容边界** | **不是**完整的 PostgreSQL 兼容。DataFusion 不接受的写法,或被流式改写器明确禁止的用法(例如对**无界流**进行全局 `ORDER BY` / `LIMIT`),会在**规划阶段**直接报错拦截。 | +| **流式 / 目录 DDL** | 例如 `WATERMARK FOR`、`CREATE STREAMING TABLE … AS SELECT`、`SHOW STREAMING TABLES`、`DROP STREAMING TABLE`,以及 `CREATE TABLE` 上的连接器 `WITH ('key' = 'value', …)` 等,均属于 **Function Stream 独有扩展**。 | + +--- + + + +## 2. 核心概念 + +在开始编写 SQL 前,请先理解以下四个支撑流处理的核心概念: + +| 概念 | SQL 关键字 | 说明 | +|------|------------|------| +| **逻辑表 (TABLE)** | `CREATE TABLE` | 数据的「目录项」:注册在系统 **Catalog** 中的**静态定义**,仅记录外部数据源的连接信息、格式和 Schema,**不消耗计算资源**。 | +| **流任务 (STREAMING TABLE)** | `CREATE STREAMING TABLE ... AS SELECT` | **持续运行的物理管道**:引擎在后台拉起分布式计算任务,将结果以**纯追加(Append-only)**方式持续写入外部系统。 | +| **事件时间 (Event Time)** | `WATERMARK FOR ` | 引擎内部用于**推进时间进度**、触发窗口结算的**时间基准列**。 | +| **水位线 (Watermark)** | `AS - INTERVAL ...` | 对**迟到、乱序**数据的容忍度;时间推进由水位线驱动,**过度迟到**的事件会被安全丢弃。 | + +> **完整参考**:支持的连接器、数据格式和 SQL 数据类型,请参阅 [连接器、格式与类型参考](connectors-and-formats-zh.md)。 + +--- + + + +## 3. 查询语法结构 + +`CREATE STREAMING TABLE ... AS` 后面是一条**持续运行**的查询,其主体子句顺序与**标准 SQL**一致: + +```text +[ WITH with_query [, ...] ] +SELECT select_expr [, ...] +FROM from_item +[ JOIN join_item [, ...] ] +[ WHERE condition ] +[ GROUP BY grouping_element [, ...] 
] +[ HAVING condition ] +``` + +| 子句 | 作用 | +|------|------| +| **`WITH`** | 可选公用表表达式(CTE),便于拆分复杂查询。 | +| **`SELECT` / `FROM`** | 投影与输入关系(目录表、子查询等)。 | +| **`JOIN`** | 多路输入关联(例如窗口双流 JOIN)。 | +| **`WHERE`** | 在聚合或关联**之前**生效的行级过滤。 | +| **`GROUP BY` / `HAVING`** | 分组键与聚合后的过滤;流计算中常与 `TUMBLE(...)`、`HOP(...)`、`SESSION(...)` 产生的窗口列配合使用。 | + +--- + + + +## 4. JOIN 语法与支持 + +流式 JOIN 是实时计算中语义最重的算子之一;Function Stream 通过规划器规则约束**有界状态**与**shuffle 键**。 + +### 4.1 SQL 写法 + +关联紧跟在 `FROM` 之后(或接在前一个 `join_item` 之后),支持连续多表关联: + +```text +from_item + { [ INNER ] JOIN + | LEFT [ OUTER ] JOIN + | RIGHT [ OUTER ] JOIN + | FULL [ OUTER ] JOIN + } from_item + ON join_condition +``` + +`INNER JOIN` 与单独写 `JOIN` 等价;`OUTER` 可省略(`LEFT JOIN` 即 `LEFT OUTER JOIN`)。 + +```text +FROM a +JOIN b ON ... +LEFT JOIN c ON ... +``` + +### 4.2 当前规划器支持的语义矩阵 + +| 业务场景 | 允许的 Join 类型 | 约束与说明 | +|----------|------------------|------------| +| **双流均无窗口**(持续更新型关联) | **仅 `INNER`** | 外连接 `LEFT` / `RIGHT` / `FULL` 会被拒绝:需要**有界状态**(如窗口)。`ON` 须含**至少一组等值条件**。 | +| **两侧具有相同窗口**(时间对齐关联) | **`INNER`、`LEFT`、`RIGHT`、`FULL`** | 两侧窗口定义须**完全一致**。**不支持**以 **`SESSION` 窗口**作为 Join 输入。`ON` 须包含两侧的**同一窗口列**(及业务等值键)。 | +| **混合窗口类型** | — | **不支持**(一侧有窗口、一侧无窗口会被拒绝)。 | + +### 4.3 连接条件(`ON`)说明 + +- 流关联改写器目前仅支持 **`ON join_condition`**;**`USING (...)`** 与**自然连接**在流式计划中**未实现**。 + +针对**无窗口**的双流 **`INNER JOIN`**,还须同时满足: + +1. **必须存在等值键**:规划器收集「左 = 右」用于 **Shuffle / 分区**;若等值键列表为空,规划报错。 +2. **标准等值连接(equi-join)**:由若干组 **`左侧 = 右侧`** 用 **`AND`** 连接。 + +合法示例: + +```sql +-- 单键等值 +ON o.order_id = s.order_id +``` + +```sql +-- 多键等值 +ON o.tenant_id = s.tenant_id AND o.order_id = s.order_id +``` + +不合法(不满足无窗口双流 INNER 规划要求):`ON` 中**仅有**范围比较、缺乏成对 **`左 = 右`** 的等值结构等。 + +对齐窗口下的 `LEFT JOIN` 完整示例见 [场景 4:窗口双流关联](#场景-4窗口双流关联-window-join)。 + +--- + + + +## 实战一:注册数据源 (TABLE) + +从两条典型业务流开始:**广告曝光流**与**广告点击流**。 + +> **核心原则**:必须为输入流声明 **事件时间** 与 **水位线**,这是引擎推进时间的**唯一依据**。 + +```sql +-- 1. 
注册广告曝光流 +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + -- 核心:将 impression_time 设为事件时间,并容忍最多 2 秒的数据迟到乱序 + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. 注册广告点击流 +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +| 要素 | 含义 | +|------|------| +| `WATERMARK FOR <列> AS <列> - INTERVAL 'n' SECOND` | 声明事件时间列与最大可容忍乱序延迟。 | +| `WITH (...)` | 连接器属性:类型、Topic、格式、Broker 等。 | + +--- + + + +## 实战二:构建实时 Pipeline (STREAMING TABLE) + +下面用 **4 个工业界常见场景**,演示 `CREATE STREAMING TABLE ... 
AS SELECT` 如何落地为实时拓扑。 + + + +### 场景 1:滚动窗口 (Tumbling Window) + +**业务需求**:每 1 分钟统计一次各**广告计划**的曝光总量。 +**特性**:时间轴被切成**固定大小、互不重叠**的桶,例如 `[00:00–00:01)`、`[00:01–00:02)` … + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, -- 指代 SELECT 中的第一个字段 (time_window) + campaign_id; +``` + + + +### 场景 2:滑动窗口 (Hopping Window) + +**业务需求**:统计**过去 10 分钟**内各广告的独立访客数(UV),且**每 1 分钟**输出一次刷新结果。 +**特性**:窗口**相互重叠**,适合平滑的实时趋势监控。 + +```sql +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT CAST(user_id AS STRING)) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + + + +### 场景 3:会话窗口 (Session Window) + +**业务需求**:按用户观察广告曝光**会话**;若该用户 **30 秒**内无新曝光,则视为会话结束并输出统计。 +**特性**:窗口边界由**数据到达疏密**动态决定,适合**行为链路 / 会话分析**。 + +```sql +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + + + +### 场景 4:窗口双流关联 (Window Join) + +**业务需求**:联合曝光流与点击流,计算 **5 分钟粒度**点击率相关指标。 +**特性**:两条流在**完全相同**的时间窗口内对齐;水位线越过窗口后状态可回收,**避免无界状态导致 OOM**。 + +```sql +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( 
+ -- 左流:5 分钟曝光量 + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + -- 右流:5 分钟点击量 + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +-- 关键:ON 须包含时间窗口列,保证状态有界 +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **硬性要求**:关联条件**必须**包含**相同的时间窗口列**,以确保 Join 状态有界。 + +--- + + + +## 实战三:生命周期与流任务管理 + +Function Stream 提供与**目录元数据**、**运行中管道**、**物理拓扑**相关的运维类 SQL。 + + + +### 1. 数据源与元数据管理 + +```sql +-- 已注册的静态表(数据源);结果形态由引擎决定 +SHOW TABLES; + +-- 表定义 / 选项文本;结果形态由引擎决定 +SHOW CREATE TABLE ad_clicks; +``` + + + +### 2. 实时 Pipeline 监控与排障 + +```sql +-- 正在运行的流任务(结果集形态由引擎决定) +SHOW STREAMING TABLES; + +-- 某条管道的物理计划 / 拓扑文本(展示格式由引擎决定) +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +> **说明**:各类 `SHOW …` 的列名、类型与终端排版可能随版本变化,请以实际 CLI 或服务端返回为准。 + + + +### 3. 安全停止与释放资源 + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +> **说明**:`DROP STREAMING TABLE` 停止**流计算任务**并释放运行资源;**不会**删除 `CREATE TABLE` 注册的**数据源目录项**,源表可继续被新管道引用。 + +--- + + + +## SQL 语法速查表 + +| 目标操作 | 典型 SQL / 语法要点 | +|----------|---------------------| +| 注册数据源 | `CREATE TABLE name (...) WITH (...)`,并配合 `WATERMARK FOR` | +| 定义事件时间 / 水位线 | `WATERMARK FOR AS - INTERVAL 'n' SECOND` | +| 创建流任务 | `CREATE STREAMING TABLE name WITH (...) 
AS SELECT ...` | +| 查询子句骨架 | `WITH` → `SELECT` → `FROM` → `[JOIN]` → `[WHERE]` → `[GROUP BY]` → `[HAVING]`(详见 [第 3 节](#3-查询语法结构)) | +| 多流 JOIN | `INNER` / `LEFT` / `RIGHT` / `FULL` + `ON`(约束见 [第 4 节](#4-join-语法与支持)) | +| 时间窗口函数 | `TUMBLE(interval)`、`HOP(slide, size)`、`SESSION(gap)` | +| 查看源表 | `SHOW TABLES`;`SHOW CREATE TABLE <表名>` | +| 监控计算流 | `SHOW STREAMING TABLES` | +| 排查执行图 | `SHOW CREATE STREAMING TABLE <流任务名>` | +| 停止流任务 | `DROP STREAMING TABLE <流任务名>` | diff --git a/docs/streaming-sql-guide.md b/docs/streaming-sql-guide.md new file mode 100644 index 00000000..116323eb --- /dev/null +++ b/docs/streaming-sql-guide.md @@ -0,0 +1,410 @@ + + +# Function Stream: Streaming SQL Guide + +[中文](streaming-sql-guide-zh.md) | [English](streaming-sql-guide.md) + +Function Stream provides a declarative SQL interface for building real-time stream processing pipelines. With Streaming SQL you can ingest unbounded data streams, perform time-windowed aggregations, join multiple streams, and manage pipeline lifecycles — **without writing imperative code end to end**. + +### End-to-end data flow + +```mermaid +flowchart LR + subgraph In["Sources (TABLE)"] + S["Kafka / external systems"] + end + subgraph Pipe["Continuous query (STREAMING TABLE)"] + P["SQL logical plan"] + W["Windows / joins / aggregates"] + end + subgraph Out["Outputs"] + K["Sink connector"] + end + S --> P --> W --> K +``` + +> **How to use this guide**: section anchors match the **Guide map** table below. When planning fails, treat the SQL engine error message as authoritative. + +--- + +## Guide map + +| Section | What it covers | +|---------|----------------| +| [1. SQL dialect compatibility](#1-sql-dialect-compatibility) | DataFusion + Function Stream extensions | +| [2. Core concepts](#2-core-concepts) | TABLE / STREAMING TABLE / event time / watermark | +| [3. Query syntax outline](#3-query-syntax-outline) | `WITH` → `SELECT` → `FROM` → `JOIN` → … | +| [4. 
Join syntax and support](#4-join-syntax-and-support) | Syntax, semantics matrix, `ON` clause details | +| [Hands-on 1: Register sources (TABLE)](#hands-on-1-register-sources-table) | `CREATE TABLE` + `WATERMARK` + `WITH` | +| [Hands-on 2: Build pipelines](#hands-on-2-build-pipelines-streaming-table) | Four full scenario SQL examples | +| [Hands-on 3: Lifecycle & jobs](#hands-on-3-lifecycle-and-streaming-jobs) | `SHOW` / `DROP STREAMING TABLE` | +| [SQL quick reference](#sql-quick-reference) | One-page statement cheat sheet | + +--- + + + +## 1. SQL dialect compatibility + +> **Bottom line**: Function Stream is **“DataFusion SQL + Function Stream streaming DDL extensions.”** + +| Layer | Details | +|-------|---------| +| **Parser & planner** | SQL is parsed with **sqlparser** using the **`FunctionStreamDialect`**, then planned through **Apache DataFusion**’s SQL frontend (`SqlToRel`) and logical planner. | +| **Query (`SELECT …`) syntax** | **DataFusion SQL**: broadly **ANSI-like**, often **PostgreSQL-flavored** (identifiers, common functions, `JOIN` / `WHERE` / `GROUP BY`, etc.). | +| **Compatibility limits** | **Not** full PostgreSQL. Constructs DataFusion rejects, or that streaming rewriters forbid (e.g. global `ORDER BY` / `LIMIT` on **unbounded** queries), fail at **plan time**. | +| **Streaming / catalog DDL** | `WATERMARK FOR`, `CREATE STREAMING TABLE … AS SELECT`, `SHOW STREAMING TABLES`, `DROP STREAMING TABLE`, and connector `WITH ('key' = 'value', …)` on `CREATE TABLE` are **Function Stream–specific** extensions. | + +--- + + + +## 2. Core concepts + +Understand these four ideas before you write streaming SQL: + +| Concept | SQL keyword | Description | +|---------|-------------|-------------| +| **Logical table (TABLE)** | `CREATE TABLE` | A **catalog entry**: static definition of an external source (connection, format, schema). **No compute** is consumed until a pipeline reads it. | +| **Streaming job (STREAMING TABLE)** | `CREATE STREAMING TABLE ... 
AS SELECT` | A **continuous physical pipeline**: the engine runs distributed tasks and appends results to external systems in **append-only** mode. | +| **Event time** | `WATERMARK FOR <column>` | The **time basis** the engine uses to advance time and trigger window completion. | +| **Watermark** | `AS <column> - INTERVAL ...` | **Tolerated lateness / disorder**; time advances with watermarks, and **very late** events are dropped safely. | + +> **Full reference** for connectors, formats, and SQL types: [Connectors, Formats & Data Types](connectors-and-formats.md). + +--- + + + +## 3. Query syntax outline + +`CREATE STREAMING TABLE ... AS` wraps a **continuous** query whose clause order matches **standard SQL**: + +```text +[ WITH with_query [, ...] ] +SELECT select_expr [, ...] +FROM from_item +[ JOIN join_item [, ...] ] +[ WHERE condition ] +[ GROUP BY grouping_element [, ...] ] +[ HAVING condition ] +``` + +| Clause | Role | +|--------|------| +| **`WITH`** | Optional CTEs for readability. | +| **`SELECT` / `FROM`** | Projections and inputs (catalog tables, subqueries, etc.). | +| **`JOIN`** | Combine inputs (e.g. aligned window joins). | +| **`WHERE`** | Row filters **before** aggregation / join evaluation. | +| **`GROUP BY` / `HAVING`** | Grouping keys and post-aggregate filters; often used with `TUMBLE(...)`, `HOP(...)`, `SESSION(...)`. | + +Whether a clause is accepted depends on the engine and planner; use the frontend error if planning fails. + +--- + + + +## 4. Join syntax and support + +Streaming joins are among the heaviest operators; Function Stream enforces **bounded state** and **shuffle keys** through planner rules. + +### 4.1 SQL shape + +Joins follow `FROM` (or a prior `join_item`) and can be chained: + +```text +from_item + { [ INNER ] JOIN + | LEFT [ OUTER ] JOIN + | RIGHT [ OUTER ] JOIN + | FULL [ OUTER ] JOIN + } from_item + ON join_condition +``` + +`INNER JOIN` and `JOIN` are equivalent; `OUTER` is optional (`LEFT JOIN` = `LEFT OUTER JOIN`). 
+ +```text +FROM a +JOIN b ON ... +LEFT JOIN c ON ... +``` + +### 4.2 Semantics matrix (current planner) + +| Scenario | Allowed join kinds | Constraints | +|----------|-------------------|-------------| +| **Stream–stream, no window on both sides** (“updating” join) | **`INNER` only** | `LEFT` / `RIGHT` / `FULL` rejected: **bounded** state (e.g. windows) required. `ON` needs **at least one equality** predicate. | +| **Stream–stream, identical tumbling/hopping window on both sides** | **`INNER`**, **`LEFT`**, **`RIGHT`**, **`FULL`** | Window specs must **match exactly**. **`SESSION` windows are not supported** as join inputs. `ON` must include the **same window column** on both sides (plus business equi-keys). | +| **Mixed windowing** | — | **Not supported** (one side windowed, one not). | + +### 4.3 `ON` clause details + +- Only **`ON join_condition`** is implemented in the streaming join rewriter; **`USING (...)`** and **natural joins** are **not** supported on streaming plans. + +For **non-windowed** stream–stream **`INNER JOIN`**, you also need: + +1. **At least one equi-key** — the planner collects `left = right` pairs for **shuffle / partitioning**; an empty equi-key list fails planning. +2. **Equi-join shape** — a conjunction of **`left = right`** predicates joined with **`AND`**. + +Valid examples: + +```sql +-- single-key equality +ON o.order_id = s.order_id +``` + +```sql +-- composite-key equality +ON o.tenant_id = s.tenant_id AND o.order_id = s.order_id +``` + +Invalid for this path: `ON` with **only** range predicates, or no paired **`left = right`** structure. + +A full aligned-window **`LEFT JOIN`** example is in [Scenario 4: Window join](#scenario-4-window-join). + +--- + + + +## Hands-on 1: Registering data sources (TABLE) + +Start from two common streams: **ad impressions** and **ad clicks**. + +> **Rule**: every input stream must declare **event time** and a **watermark** — the **only** basis the engine uses to advance time. + +```sql +-- 1. 
Register the ad-impressions stream +CREATE TABLE ad_impressions ( + impression_id VARCHAR, + ad_id BIGINT, + campaign_id BIGINT, + user_id VARCHAR, + impression_time TIMESTAMP NOT NULL, + -- Event time + up to 2s out-of-order tolerance + WATERMARK FOR impression_time AS impression_time - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); + +-- 2. Register the ad-clicks stream +CREATE TABLE ad_clicks ( + click_id VARCHAR, + impression_id VARCHAR, + ad_id BIGINT, + click_time TIMESTAMP NOT NULL, + WATERMARK FOR click_time AS click_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'raw_ad_clicks', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +); +``` + +| Element | Meaning | +|---------|---------| +| `WATERMARK FOR <column> AS <column> - INTERVAL 'n' SECOND` | Declares event-time column and max tolerated disorder. | +| `WITH (...)` | Connector properties: type, topic, format, brokers, etc. | + +--- + + + +## Hands-on 2: Building real-time pipelines (STREAMING TABLE) + +Four common patterns show how `CREATE STREAMING TABLE ... AS SELECT` becomes a running topology. + + + +### Scenario 1: Tumbling window + +**Goal**: every **1 minute**, count **total impressions per campaign**. +**Behavior**: fixed-size, **non-overlapping** buckets such as `[00:00–00:01)`, `[00:01–00:02)`, … + +```sql +CREATE STREAMING TABLE metric_tumble_impressions_1m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_impressions_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + TUMBLE(INTERVAL '1' MINUTE) AS time_window, + campaign_id, + COUNT(*) AS total_impressions +FROM ad_impressions +GROUP BY + 1, -- first SELECT column (time_window) + campaign_id; +``` + + + +### Scenario 2: Hopping window + +**Goal**: **UV per ad** over the **last 10 minutes**, **refreshed every 1 minute**. 
+**Behavior**: **overlapping** windows — good for smoothed real-time trends. + +```sql +CREATE STREAMING TABLE metric_hop_uv_10m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_uv_10m_step_1m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + HOP(INTERVAL '1' MINUTE, INTERVAL '10' MINUTE) AS time_window, + ad_id, + COUNT(DISTINCT CAST(user_id AS STRING)) AS unique_users +FROM ad_impressions +GROUP BY + 1, + ad_id; +``` + + + +### Scenario 3: Session window + +**Goal**: per-user **impression sessions**; a session **ends** after **30 seconds** without new impressions. +**Behavior**: window bounds follow **arrival density** — useful for **session / funnel** analytics. + +```sql +CREATE STREAMING TABLE metric_session_impressions WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_session_impressions', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + SESSION(INTERVAL '30' SECOND) AS time_window, + user_id, + COUNT(*) AS impressions_in_session +FROM ad_impressions +GROUP BY + 1, + user_id; +``` + + + +### Scenario 4: Window join + +**Goal**: combine impressions and clicks for **5-minute** CTR-style metrics. +**Behavior**: both streams use the **same** window; when the watermark passes the window, state can be reclaimed — **avoiding unbounded state / OOM**. 
+ +```sql +CREATE STREAMING TABLE metric_window_join_ctr_5m WITH ( + 'connector' = 'kafka', + 'topic' = 'sink_ctr_5m', + 'format' = 'json', + 'bootstrap.servers' = 'localhost:9092' +) AS +SELECT + imp.time_window, + imp.ad_id, + imp.impressions, + COALESCE(clk.clicks, 0) AS clicks +FROM ( + -- Left: 5-minute impressions + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS impressions + FROM ad_impressions + GROUP BY 1, ad_id +) imp +LEFT JOIN ( + -- Right: 5-minute clicks + SELECT TUMBLE(INTERVAL '5' MINUTE) AS time_window, ad_id, COUNT(*) AS clicks + FROM ad_clicks + GROUP BY 1, ad_id +) clk +-- ON must include the window column so join state stays bounded +ON imp.time_window = clk.time_window AND imp.ad_id = clk.ad_id; +``` + +> **Hard requirement**: the join predicate **must** include the **same window column** on both sides so join state stays bounded. + +--- + + + +## Hands-on 3: Lifecycle & streaming job management + +Operational SQL for **catalog metadata**, **running pipelines**, and **physical topology**. + + + +### 1. Catalog & metadata + +```sql +-- Registered static tables (sources); result shape is engine-defined +SHOW TABLES; + +-- Table definition / options text; result shape is engine-defined +SHOW CREATE TABLE ad_clicks; +``` + + + +### 2. Monitoring & troubleshooting + +```sql +-- Running streaming jobs (shape of result set is engine-defined) +SHOW STREAMING TABLES; + +-- Physical plan / topology text for one pipeline (format is engine-defined) +SHOW CREATE STREAMING TABLE metric_tumble_impressions_1m; +``` + +> **Note:** Column names, types, and printable layout for `SHOW …` statements may change between releases; use the CLI or server response you get at runtime as the source of truth. + + + +### 3. 
Safe shutdown & resource release + +```sql +DROP STREAMING TABLE metric_tumble_impressions_1m; +``` + +> **Note**: `DROP STREAMING TABLE` stops the **streaming job** and releases runtime resources; it does **not** remove **`CREATE TABLE`** source catalog entries — sources remain available for new pipelines. + +--- + + + +## SQL quick reference + +| Goal | Typical SQL / syntax | +|------|----------------------| +| Register a source | `CREATE TABLE name (...) WITH (...)` plus `WATERMARK FOR` | +| Event time & watermark | `WATERMARK FOR <column> AS <column> - INTERVAL 'n' SECOND` | +| Create a streaming job | `CREATE STREAMING TABLE name WITH (...) AS SELECT ...` | +| Query skeleton | `WITH` → `SELECT` → `FROM` → `[JOIN]` → `[WHERE]` → `[GROUP BY]` → `[HAVING]` — see [Section 3](#3-query-syntax-outline) | +| Multi-stream joins | `INNER` / `LEFT` / `RIGHT` / `FULL` + `ON` — rules in [Section 4](#4-join-syntax-and-support) | +| Window TVFs | `TUMBLE(interval)`, `HOP(slide, size)`, `SESSION(gap)` | +| Inspect sources | `SHOW TABLES`; `SHOW CREATE TABLE <table_name>` | +| Inspect running jobs | `SHOW STREAMING TABLES` | +| Inspect physical plan | `SHOW CREATE STREAMING TABLE <streaming_table_name>` | +| Stop a streaming job | `DROP STREAMING TABLE <streaming_table_name>` | diff --git a/protocol/Cargo.toml b/protocol/Cargo.toml index fde9de52..51b1f3c1 100644 --- a/protocol/Cargo.toml +++ b/protocol/Cargo.toml @@ -6,10 +6,14 @@ description = "Protocol Buffers protocol definitions for function stream" license = "MIT OR Apache-2.0" repository = "https://github.com/your-username/rust-function-stream" +# prost/serde are required by generated modules under `generated/`; machete does not scan those paths. 
+[package.metadata.cargo-machete] +ignored = ["prost", "serde"] + [dependencies] prost = "0.13" tonic = { version = "0.12", features = ["default"] } -log = "0.4" +serde = { version = "1.0", features = ["derive"] } [build-dependencies] tonic-build = "0.12" diff --git a/protocol/build.rs b/protocol/build.rs index 17e77d30..e8fc94c9 100644 --- a/protocol/build.rs +++ b/protocol/build.rs @@ -10,54 +10,65 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::path::Path; +use std::path::{Path, PathBuf}; fn main() -> Result<(), Box> { - // Initialize logger for build script env_logger::init(); - // Create output directories in the protocol package directory - // Use CARGO_MANIFEST_DIR to get the package root directory let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")?; let out_dir = Path::new(&manifest_dir).join("generated"); - let proto_file = Path::new(&manifest_dir).join("proto/function_stream.proto"); - - // Note: Cargo doesn't directly support cleaning custom directories via cargo clean. - // The generated directory will be automatically regenerated on each build if needed. - // To clean it manually, use: ./clean.sh or make clean or rm -rf protocol/generated log::info!("Generated code will be placed in: {}", out_dir.display()); - log::info!("Proto file: {}", proto_file.display()); - // Create output directories let cli_dir = out_dir.join("cli"); let service_dir = out_dir.join("service"); std::fs::create_dir_all(&cli_dir)?; std::fs::create_dir_all(&service_dir)?; - log::info!( - "Created output directories: {} and {}", - cli_dir.display(), - service_dir.display() - ); - // Generate code for CLI - only client code needed + // 1. 
function_stream.proto → CLI (client) and Service (server) tonic_build::configure() .out_dir(&cli_dir) - .build_client(true) // Enable client code generation - .build_server(false) // Disable server code generation + .build_client(true) + .build_server(false) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; - // Generate code for Service - only server code needed tonic_build::configure() .out_dir(&service_dir) - .build_client(false) // Disable client code generation - .build_server(true) // Enable server code generation + .build_client(false) + .build_server(true) .compile_protos(&["proto/function_stream.proto"], &["proto"])?; + let api_dir = out_dir.join("api"); + std::fs::create_dir_all(&api_dir)?; + + let descriptor_path = + PathBuf::from(std::env::var("OUT_DIR").unwrap()).join("fs_api_descriptor.bin"); + + tonic_build::configure() + .out_dir(&api_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .file_descriptor_set_path(&descriptor_path) + .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") + .type_attribute(".", "#[serde(rename_all = \"camelCase\")]") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/function_stream_graph.proto"], &["proto"])?; + + let storage_dir = out_dir.join("storage"); + std::fs::create_dir_all(&storage_dir)?; + tonic_build::configure() + .out_dir(&storage_dir) + .protoc_arg("--experimental_allow_proto3_optional") + .build_client(false) + .build_server(false) + .compile_protos(&["proto/storage.proto"], &["proto"])?; + log::info!("Protocol Buffers code generated successfully"); println!("cargo:rustc-env=PROTO_GEN_DIR={}", out_dir.display()); - println!("cargo:rerun-if-changed={}", proto_file.display()); + println!("cargo:rerun-if-changed=proto/function_stream.proto"); + println!("cargo:rerun-if-changed=proto/function_stream_graph.proto"); + println!("cargo:rerun-if-changed=proto/storage.proto"); Ok(()) } diff --git a/protocol/proto/function_stream_graph.proto 
b/protocol/proto/function_stream_graph.proto new file mode 100644 index 00000000..48b68a2f --- /dev/null +++ b/protocol/proto/function_stream_graph.proto @@ -0,0 +1,477 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Streaming pipeline wire types (FsProgram) and optional job graph metadata. +syntax = "proto3"; + +package function_stream.v1; + +// ============================================================================= +// Topology & Execution Graph +// ============================================================================= + +message StreamingJobGraph { + string job_id = 1; + string job_name = 2; + repeated ExecutionNode nodes = 3; + repeated ExecutionEdge edges = 4; + FsProgramConfig program_config = 5; +} + +message ExecutionNode { + uint32 node_id = 1; + uint32 parallelism = 2; + string label = 3; + repeated ChainedOperator operator_chain = 4; + repeated QualifiedSchema output_schemas = 5; +} + +message ExecutionEdge { + uint32 source_node_id = 1; + uint32 target_node_id = 2; + QualifiedSchema schema = 3; + RoutingStrategy strategy = 4; +} + +enum RoutingStrategy { + ROUTING_STRATEGY_UNSPECIFIED = 0; + ROUTING_STRATEGY_FORWARD = 1; + ROUTING_STRATEGY_SHUFFLE = 2; + ROUTING_STRATEGY_BROADCAST = 3; + ROUTING_STRATEGY_REBALANCE = 4; +} + +message QualifiedSchema { + string arrow_schema_serialized = 1; + uint32 event_time_column_index = 2; + repeated uint32 state_key_indices = 3; + repeated uint32 distribution_key_indices = 4; +} + + +// 
─────────────────────── Operators ─────────────────────── + +message ConnectorOp { + string connector = 1; + reserved 2; // removed: map config_map + optional FsSchema fs_schema = 3; + string name = 4; + string description = 5; + + oneof config { + KafkaSourceConfig kafka_source = 6; + KafkaSinkConfig kafka_sink = 7; + GenericConnectorConfig generic = 8; + } +} + +// ─────────────────────── Kafka Connector Configs ─────────────────────── + +message KafkaSourceConfig { + string topic = 1; + string bootstrap_servers = 2; + optional string group_id = 3; + optional string group_id_prefix = 4; + KafkaOffsetMode offset_mode = 5; + KafkaReadMode read_mode = 6; + KafkaAuthConfig auth = 7; + map client_configs = 8; + FormatConfig format = 9; + BadDataPolicy bad_data_policy = 10; + uint32 rate_limit_msgs_per_sec = 11; + optional string value_subject = 12; +} + +message KafkaSinkConfig { + string topic = 1; + string bootstrap_servers = 2; + KafkaSinkCommitMode commit_mode = 3; + optional string key_field = 4; + optional string timestamp_field = 5; + KafkaAuthConfig auth = 6; + map client_configs = 7; + FormatConfig format = 8; + optional string value_subject = 9; +} + +// Fallback for non-Kafka connectors that are not yet strongly typed. 
+message GenericConnectorConfig { + map properties = 1; +} + +// ─────────────────────── Kafka Auth ─────────────────────── + +message KafkaAuthConfig { + oneof auth { + KafkaAuthNone none = 1; + KafkaAuthSasl sasl = 2; + KafkaAuthAwsMskIam aws_msk_iam = 3; + } +} + +message KafkaAuthNone {} + +message KafkaAuthSasl { + string protocol = 1; + string mechanism = 2; + string username = 3; + string password = 4; +} + +message KafkaAuthAwsMskIam { + string region = 1; +} + +// ─────────────────────── Format & Data-Quality ─────────────────────── + +message FormatConfig { + oneof format { + JsonFormatConfig json = 1; + RawStringFormatConfig raw_string = 2; + RawBytesFormatConfig raw_bytes = 3; + } +} + +message JsonFormatConfig { + TimestampFormatProto timestamp_format = 1; + DecimalEncodingProto decimal_encoding = 2; + bool include_schema = 3; + bool confluent_schema_registry = 4; + optional uint32 schema_id = 5; + bool debezium = 6; + bool unstructured = 7; +} + +message RawStringFormatConfig {} +message RawBytesFormatConfig {} + +// ─────────────────────── Kafka Enums ─────────────────────── + +enum TimestampFormatProto { + TIMESTAMP_RFC3339 = 0; + TIMESTAMP_UNIX_MILLIS = 1; +} + +enum DecimalEncodingProto { + DECIMAL_NUMBER = 0; + DECIMAL_STRING = 1; + DECIMAL_BYTES = 2; +} + +enum BadDataPolicy { + BAD_DATA_FAIL = 0; + BAD_DATA_DROP = 1; +} + +enum KafkaOffsetMode { + KAFKA_OFFSET_EARLIEST = 0; + KAFKA_OFFSET_LATEST = 1; + KAFKA_OFFSET_GROUP = 2; +} + +enum KafkaReadMode { + KAFKA_READ_DEFAULT = 0; + KAFKA_READ_COMMITTED = 1; + KAFKA_READ_UNCOMMITTED = 2; +} + +enum KafkaSinkCommitMode { + KAFKA_SINK_AT_LEAST_ONCE = 0; + KAFKA_SINK_EXACTLY_ONCE = 1; +} + +message ValuePlanOperator { + string name = 1; + bytes physical_plan = 2; +} + +message KeyPlanOperator { + string name = 1; + bytes physical_plan = 2; + repeated uint64 key_fields = 3; +} + +message ProjectionOperator { + string name = 1; + FsSchema input_schema = 2; + FsSchema output_schema = 3; + repeated bytes 
exprs = 4; +} + +message TumblingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + bytes binning_function = 3; + FsSchema input_schema = 4; + FsSchema partial_schema = 5; + bytes partial_aggregation_plan = 6; + bytes final_aggregation_plan = 7; + optional bytes final_projection = 8; +} + +message SlidingWindowAggregateOperator { + string name = 1; + uint64 width_micros = 2; + uint64 slide_micros = 3; + bytes binning_function = 4; + FsSchema input_schema = 5; + FsSchema partial_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; + bytes final_projection = 9; +} + +message SessionWindowAggregateOperator { + string name = 1; + uint64 gap_micros = 2; + string window_field_name = 3; + uint64 window_index = 4; + FsSchema input_schema = 5; + FsSchema unkeyed_aggregate_schema = 6; + bytes partial_aggregation_plan = 7; + bytes final_aggregation_plan = 8; +} + +message JoinOperator { + string name = 1; + FsSchema left_schema = 2; + FsSchema right_schema = 3; + FsSchema output_schema = 4; + bytes join_plan = 5; + optional uint64 ttl_micros = 6; +} + +message LookupJoinCondition { + bytes left_expr = 1; + string right_key = 2; +} + +message LookupJoinOperator { + FsSchema input_schema = 1; + FsSchema lookup_schema = 2; + ConnectorOp connector = 3; + repeated LookupJoinCondition key_exprs = 4; + JoinType join_type = 5; + optional uint64 ttl_micros = 6; + optional uint64 max_capacity_bytes = 7; +} + +message WindowFunctionOperator { + string name = 1; + FsSchema input_schema = 2; + bytes binning_function = 3; + bytes window_function_plan = 4; +} + +enum AsyncUdfOrdering { + UNORDERED = 0; + ORDERED = 1; +} + +message AsyncUdfOperator { + string name = 1; + DylibUdfConfig udf = 2; + repeated bytes arg_exprs = 3; + repeated bytes final_exprs = 4; + AsyncUdfOrdering ordering = 5; + uint32 max_concurrency = 6; + uint64 timeout_micros = 7; +} + +message UpdatingAggregateOperator { + string name = 1; + FsSchema input_schema = 
2; + FsSchema final_schema = 3; + bytes aggregate_exec = 5; + bytes metadata_expr = 6; + uint64 flush_interval_micros = 7; + uint64 ttl_micros = 8; +} + +// ─────────────────────── Watermark ─────────────────────── + +message ExpressionWatermarkConfig { + uint64 period_micros = 1; + optional uint64 idle_time_micros = 2; + FsSchema input_schema = 3; + bytes expression = 4; +} + +// ─────────────────────── Windows ─────────────────────── + +message Window { + oneof window { + SlidingWindow sliding_window = 2; + TumblingWindow tumbling_window = 3; + InstantWindow instant_window = 4; + SessionWindow session_window = 5; + } +} + +message SlidingWindow { + uint64 size_micros = 1; + uint64 slide_micros = 2; +} + +message TumblingWindow { + uint64 size_micros = 1; +} + +message InstantWindow {} + +message SessionWindow { + uint64 gap_micros = 1; +} + +// ─────────────────────── Enums ─────────────────────── + +enum JoinType { + INNER = 0; + LEFT = 1; + RIGHT = 2; + FULL = 3; +} + +enum OffsetMode { + EARLIEST = 0; + LATEST = 1; +} + +enum EdgeType { + UNUSED = 0; + FORWARD = 1; + SHUFFLE = 2; + LEFT_JOIN = 3; + RIGHT_JOIN = 4; +} + +// ─────────────────── Physical Extension Nodes ─────────────────── + +message MemExecNode { + string table_name = 1; + string schema = 2; // json-encoded +} + +message UnnestExecNode { + string schema = 1; // json-encoded +} + +message DebeziumDecodeNode { + string schema = 1; // json-encoded + repeated uint64 primary_keys = 2; +} + +message DebeziumEncodeNode { + string schema = 1; // json-encoded +} + +message FsExecNode { + oneof node { + MemExecNode mem_exec = 1; + UnnestExecNode unnest_exec = 2; + DebeziumDecodeNode debezium_decode = 3; + DebeziumEncodeNode debezium_encode = 4; + } +} + +// ─────────────────── Checkpoints ─────────────────── + +enum TaskCheckpointEventType { + ALIGNMENT_STARTED = 0; + CHECKPOINT_STARTED = 1; + CHECKPOINT_OPERATOR_SETUP_FINISHED = 2; + CHECKPOINT_SYNC_FINISHED = 3; + CHECKPOINT_PRE_COMMIT = 4; +} + 
+message TaskCheckpointEvent { + uint64 time = 1; + TaskCheckpointEventType event_type = 2; +} + +message TaskCheckpointDetail { + uint32 subtask_index = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + optional uint64 bytes = 4; + repeated TaskCheckpointEvent events = 5; +} + +message OperatorCheckpointDetail { + string operator_id = 1; + uint64 start_time = 2; + optional uint64 finish_time = 3; + bool has_state = 4; + optional uint64 started_metadata_write = 6; + map tasks = 5; +} + +// ─────────────────── UDF Config ─────────────────── + +message DylibUdfConfig { + string dylib_path = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + bool aggregate = 4; + bool is_async = 5; +} + +message PythonUdfConfig { + string name = 1; + repeated bytes arg_types = 2; + bytes return_type = 3; + string definition = 4; +} + +message FsProgramConfig { + map udf_dylibs = 1; + map python_udfs = 2; +} + +// ─────────────────── Arrow Program ─────────────────── + +message FsProgram { + repeated FsNode nodes = 1; + repeated FsEdge edges = 2; + FsProgramConfig program_config = 3; +} + +message FsSchema { + string arrow_schema = 1; // json-encoded Arrow Schema + uint32 timestamp_index = 2; + repeated uint32 key_indices = 3; + bool has_keys = 4; + repeated uint32 routing_key_indices = 5; + bool has_routing_keys = 6; +} + +message ChainedOperator { + string operator_id = 1; + string operator_name = 2; + bytes operator_config = 3; +} + +message FsNode { + int32 node_index = 1; + uint32 node_id = 2; + uint32 parallelism = 3; + string description = 4; + repeated ChainedOperator operators = 5; + repeated FsSchema edges = 6; +} + +message FsEdge { + int32 source = 1; + int32 target = 2; + FsSchema schema = 4; + EdgeType edge_type = 5; +} diff --git a/protocol/proto/storage.proto b/protocol/proto/storage.proto new file mode 100644 index 00000000..d7caf7bc --- /dev/null +++ b/protocol/proto/storage.proto @@ -0,0 +1,107 @@ +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// All durable / persisted payloads for FunctionStream (single source of truth for storage wire format). +// - Stream table catalog (MetaStore KV) +// - Task rows (RocksDB task_meta / task_payload; values may be prefixed — see runtime codec) + +syntax = "proto3"; + +package function_stream.storage; + +// ============================================================================= +// Catalog table storage (coordinator SQL catalog) +// ============================================================================= + +// Top-level persisted record for one catalog table. +message TableDefinition { + string table_name = 1; + int64 updated_at_millis = 2; + oneof table_type { + // Connector-backed ingestion/egress table definition. + CatalogSourceTable connector_table = 3; + // Connector-backed lookup table definition. + CatalogSourceTable lookup_table = 5; + } +} + +// Shared connector-backed table payload for connector/lookup entries. +message CatalogSourceTable { + bytes arrow_schema_ipc = 1; + optional string event_time_field = 2; + optional string watermark_field = 3; + // Original CREATE TABLE ... WITH ('k'='v', ...) pairs — single source of truth. + map with_options = 4; + // Canonical connector identifier (e.g. kafka, postgres-cdc). + string connector = 5; + reserved 6; // removed: string opaque_config (JSON blob no longer needed) + // Human-readable note from DDL (ConnectorOp.description). + string description = 7; +} + +// ============================================================================= +// Streaming table storage (CREATE STREAMING TABLE persistence) +// ============================================================================= + +// Persisted record for one streaming table (CREATE STREAMING TABLE). +// On restart, the engine re-submits each record to JobManager to resume the pipeline. 
+message StreamingTableDefinition { + string table_name = 1; + int64 created_at_millis = 2; + // Serialized function_stream.v1.FsProgram — the full execution graph. + // Stored as opaque bytes to avoid coupling storage schema with runtime API protos. + bytes fs_program_bytes = 3; + string comment = 4; +} + +// ============================================================================= +// Task storage (RocksDB metadata + module payload) +// ============================================================================= + +// Lifecycle state persisted for task recovery. New enum values MUST be appended +// with new numbers (never renumber) for forward compatibility. +enum ComponentStateKind { + COMPONENT_STATE_KIND_UNSPECIFIED = 0; + UNINITIALIZED = 1; + INITIALIZED = 2; + STARTING = 3; + RUNNING = 4; + CHECKPOINTING = 5; + STOPPING = 6; + STOPPED = 7; + CLOSING = 8; + CLOSED = 9; + ERROR = 10; +} + +message ComponentStateProto { + ComponentStateKind kind = 1; + // Set when kind == ERROR + string error_message = 2; +} + +// Stored in CF task_meta (after magic prefix FSP1). +message TaskMetadataProto { + string task_type = 1; + ComponentStateProto state = 2; + uint64 created_at = 3; + optional uint64 checkpoint_id = 4; +} + +message TaskModuleWasm { + bytes wasm_binary = 1; +} + +message TaskModulePython { + string class_name = 1; + string module_path = 2; + optional bytes embedded_code = 3; +} + +// Stored in CF task_payload (after magic prefix FSP1). +message TaskModulePayloadProto { + oneof payload { + TaskModuleWasm wasm = 1; + TaskModulePython python = 2; + } +} diff --git a/protocol/src/lib.rs b/protocol/src/lib.rs index b0c6da06..a1bff4a5 100644 --- a/protocol/src/lib.rs +++ b/protocol/src/lib.rs @@ -10,25 +10,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Protocol Buffers protocol definitions for function stream -// This module exports the generated Protocol Buffers code - -// CLI module - exports client code #[path = "../generated/cli/function_stream.rs"] pub mod cli; -// Service module - exports server code #[path = "../generated/service/function_stream.rs"] pub mod service; -// Re-export commonly used types from both modules -// Data structures are the same in both, so we can re-export from either pub use cli::function_stream_service_client; - -// Re-export client-specific types pub use cli::function_stream_service_client::FunctionStreamServiceClient; - -// Re-export server-specific types pub use service::function_stream_service_server::{ FunctionStreamService, FunctionStreamServiceServer, }; + +pub mod function_stream_graph { + #![allow(clippy::all)] + include!("../generated/api/function_stream.v1.rs"); +} + +pub const FS_API_FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("fs_api_descriptor"); + +pub mod storage { + #![allow(clippy::all)] + #![allow(warnings)] + include!("../generated/storage/function_stream.storage.rs"); +} diff --git a/src/common/fs_schema.rs b/src/common/fs_schema.rs new file mode 100644 index 00000000..342b7c57 --- /dev/null +++ b/src/common/fs_schema.rs @@ -0,0 +1,452 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use std::sync::Arc; +use std::time::SystemTime; +use arrow::compute::{filter_record_batch, lexsort_to_indices, partition, take, SortColumn}; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::row::SortField; +use arrow_array::{PrimitiveArray, UInt64Array}; +use arrow_array::types::UInt64Type; +use protocol::function_stream_graph; +use super::{to_nanos, TIMESTAMP_FIELD}; +use std::ops::Range; +use crate::common::converter::Converter; + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: function_stream_graph::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + .routing_key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for function_stream_graph::FsSchema { + fn from(schema: 
FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? 
+ .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. + return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. 
Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + !keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + 
let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } + + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + 
.filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 00000000..e0eb8d7a --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,72 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. + +pub mod arrow_ext; +pub mod control; +pub mod date; +pub mod debezium; +pub mod fs_schema; +pub mod errors; +pub mod formats; +pub mod hash; +pub mod message; +pub mod operator_config; +pub mod task_info; +pub mod time_utils; +pub mod worker; +mod converter; + +// ── Re-exports from existing modules ── +pub use arrow_ext::{DisplayAsSql, FsExtensionType, GetArrowSchema, GetArrowType}; +pub use date::{DatePart, DateTruncPrecision}; +pub use debezium::{Debezium, DebeziumOp, UpdatingData}; +pub use hash::{range_for_server, server_for_hash, HASH_SEEDS}; +pub use message::{ArrowMessage, CheckpointBarrier, SignalMessage, Watermark}; +pub use task_info::{ChainInfo, TaskInfo}; +pub use time_utils::{from_micros, from_millis, from_nanos, to_micros, to_millis, to_nanos}; +pub use worker::{MachineId, WorkerId}; + +// ── Re-exports from new modules ── +pub use control::{ + CheckpointCompleted, CheckpointEvent, CompactionResult, ControlMessage, ControlResp, + ErrorDomain, RetryHint, StopMode, TaskCheckpointEventType, TaskError, +}; +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use errors::DataflowError; +pub use formats::{BadData, Format, Framing, JsonFormat}; +pub use operator_config::MetadataField; + +// ── Well-known column names ── +pub const TIMESTAMP_FIELD: &str = "_timestamp"; +pub const UPDATING_META_FIELD: &str = "_updating_meta"; + 
+// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/config/global_config.rs b/src/config/global_config.rs index b4f92edd..c76bf4b0 100644 --- a/src/config/global_config.rs +++ b/src/config/global_config.rs @@ -19,6 +19,13 @@ use crate::config::python_config::PythonConfig; use crate::config::service_config::ServiceConfig; use crate::config::wasm_config::WasmConfig; +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct StreamingConfig { + /// Maximum heap memory (in bytes) available to the streaming runtime's memory pool. + /// Defaults to 256 MiB when absent. 
+ pub max_memory_bytes: Option, +} + #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct GlobalConfig { pub service: ServiceConfig, @@ -31,6 +38,10 @@ pub struct GlobalConfig { pub state_storage: crate::config::storage::StateStorageConfig, #[serde(default)] pub task_storage: crate::config::storage::TaskStorageConfig, + #[serde(default)] + pub streaming: StreamingConfig, + #[serde(default)] + pub stream_catalog: crate::config::storage::StreamCatalogConfig, } impl GlobalConfig { diff --git a/src/config/storage.rs b/src/config/storage.rs index e5186648..28396d7d 100644 --- a/src/config/storage.rs +++ b/src/config/storage.rs @@ -118,3 +118,27 @@ impl Default for TaskStorageConfig { } } } + +/// Stream table catalog (`CREATE TABLE` / `SHOW TABLES`) storage. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StreamCatalogConfig { + /// When `false`, the catalog is in-memory only and is **lost on process restart**. + #[serde(default = "default_stream_catalog_persist")] + pub persist: bool, + /// RocksDB directory for persisted catalog. Default: `{data_dir}/stream_catalog`. 
+ #[serde(default)] + pub db_path: Option, +} + +fn default_stream_catalog_persist() -> bool { + true +} + +impl Default for StreamCatalogConfig { + fn default() -> Self { + Self { + persist: default_stream_catalog_persist(), + db_path: None, + } + } +} diff --git a/src/coordinator/analyze/analyzer.rs b/src/coordinator/analyze/analyzer.rs index 30552191..1b3e65f4 100644 --- a/src/coordinator/analyze/analyzer.rs +++ b/src/coordinator/analyze/analyzer.rs @@ -13,8 +13,10 @@ use super::Analysis; use crate::coordinator::execution_context::ExecutionContext; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, Statement, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, DropStreamingTableStatement, + DropTableStatement, ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, + ShowFunctions, ShowStreamingTables, StartFunction, Statement, StatementVisitor, + StatementVisitorContext, StatementVisitorResult, StopFunction, StreamingTableStatement, }; use std::fmt; @@ -108,6 +110,22 @@ impl StatementVisitor for Analyzer<'_> { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + fn visit_show_catalog_tables( + &self, + stmt: &ShowCatalogTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, @@ -115,4 +133,54 @@ impl StatementVisitor for Analyzer<'_> { ) -> StatementVisitorResult { StatementVisitorResult::Analyze(Box::new(stmt.clone())) } + + fn visit_create_table( + &self, + stmt: &CreateTable, + _context: &StatementVisitorContext, + ) -> 
StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(CreateTable::new(stmt.statement.clone()))) + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(StreamingTableStatement::new( + stmt.statement.clone(), + ))) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(DropTableStatement::new(stmt.statement.clone()))) + } + + fn visit_show_streaming_tables( + &self, + stmt: &ShowStreamingTables, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _context: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Analyze(Box::new(stmt.clone())) + } } diff --git a/src/coordinator/coordinator.rs b/src/coordinator/coordinator.rs index 4ad766d5..3328b698 100644 --- a/src/coordinator/coordinator.rs +++ b/src/coordinator/coordinator.rs @@ -10,128 +10,139 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::Arc; use std::time::Instant; use anyhow::{Context, Result}; -use crate::coordinator::analyze::{Analysis, Analyzer}; +use crate::coordinator::analyze::Analyzer; use crate::coordinator::dataset::ExecuteResult; use crate::coordinator::execution::Executor; use crate::coordinator::plan::{LogicalPlanVisitor, LogicalPlanner, PlanNode}; use crate::coordinator::statement::Statement; -use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; use super::execution_context::ExecutionContext; +use super::runtime_context::CoordinatorRuntimeContext; +#[derive(Default)] pub struct Coordinator {} -impl Default for Coordinator { - fn default() -> Self { - Self::new() - } -} - impl Coordinator { pub fn new() -> Self { Self {} } - pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { - let start_time = Instant::now(); - let context = ExecutionContext::new(); - let execution_id = context.execution_id; + // ======================================================================== + // Plan compilation + // ======================================================================== - match self.execute_pipeline(&context, stmt) { - Ok(result) => { - log::debug!( - "[{}] Execution completed in {}ms", - execution_id, - start_time.elapsed().as_millis() - ); - result - } - Err(e) => { - log::error!( - "[{}] Execution failed after {}ms. Error: {:#}", - execution_id, - start_time.elapsed().as_millis(), - e - ); - ExecuteResult::err(format!("Execution failed: {:#}", e)) - } - } + pub fn compile_plan( + &self, + stmt: &dyn Statement, + schema_provider: StreamSchemaProvider, + ) -> Result> { + self.compile_plan_internal(&ExecutionContext::new(), stmt, schema_provider) } - fn execute_pipeline( + /// Internal pipeline: Analyze → build logical plan → optimize. 
+ fn compile_plan_internal( &self, context: &ExecutionContext, stmt: &dyn Statement, - ) -> Result { - let analysis = self.step_analyze(context, stmt)?; - let plan = self.step_build_logical_plan(&analysis)?; - let optimized_plan = self.step_optimize(&analysis, plan)?; - self.step_execute(optimized_plan) - } - - fn step_analyze(&self, context: &ExecutionContext, stmt: &dyn Statement) -> Result { + schema_provider: StreamSchemaProvider, + ) -> Result> { + let exec_id = context.execution_id; let start = Instant::now(); - let analyzer = Analyzer::new(context); - let result = analyzer + + let analysis = Analyzer::new(context) .analyze(stmt) .map_err(|e| anyhow::anyhow!(e)) - .context("Analyzer phase failed"); - + .context("Analyzer phase failed")?; log::debug!( "[{}] Analyze phase finished in {}ms", - context.execution_id, + exec_id, start.elapsed().as_millis() ); - result - } - fn step_build_logical_plan(&self, analysis: &Analysis) -> Result> { - let visitor = LogicalPlanVisitor::new(); - let plan = visitor.visit(analysis); - Ok(plan) - } - - fn step_optimize( - &self, - analysis: &Analysis, - plan: Box, - ) -> Result> { - let start = Instant::now(); - let planner = LogicalPlanner::new(); - let optimized = planner.optimize(plan, analysis); + let plan = LogicalPlanVisitor::new(schema_provider).visit(&analysis); + let opt_start = Instant::now(); + let optimized = LogicalPlanner::new().optimize(plan, &analysis); log::debug!( - "Optimizer phase finished in {}ms", - start.elapsed().as_millis() + "[{}] Optimizer phase finished in {}ms", + exec_id, + opt_start.elapsed().as_millis() ); + Ok(optimized) } - fn step_execute(&self, plan: Box) -> Result { + // ======================================================================== + // Execution + // ======================================================================== + + pub fn execute(&self, stmt: &dyn Statement) -> ExecuteResult { + match CoordinatorRuntimeContext::try_from_globals() { + Ok(ctx) => 
self.execute_with_runtime_context(stmt, &ctx), + Err(e) => ExecuteResult::err(e.to_string()), + } + } + + pub async fn execute_with_stream_catalog(&self, stmt: &dyn Statement) -> ExecuteResult { + self.execute(stmt) + } + + /// Same as [`Self::execute`], but uses an explicit [`CoordinatorRuntimeContext`] (e.g. tests or custom wiring). + pub fn execute_with_runtime_context( + &self, + stmt: &dyn Statement, + runtime: &CoordinatorRuntimeContext, + ) -> ExecuteResult { let start = Instant::now(); - let task_manager = match TaskManager::get() { - Ok(tm) => tm, + let context = ExecutionContext::new(); + let exec_id = context.execution_id; + let schema_provider = runtime.planning_schema_provider(); + + let result = (|| -> Result { + let plan = self.compile_plan_internal(&context, stmt, schema_provider)?; + + let exec_start = Instant::now(); + let res = Executor::new( + Arc::clone(&runtime.task_manager), + runtime.catalog_manager.clone(), + Arc::clone(&runtime.job_manager), + ) + .execute(plan.as_ref()) + .map_err(|e| anyhow::anyhow!(e)) + .context("Executor phase failed")?; + + log::debug!( + "[{}] Executor phase finished in {}ms", + exec_id, + exec_start.elapsed().as_millis() + ); + Ok(res) + })(); + + match result { + Ok(res) => { + log::debug!( + "[{}] Execution completed in {}ms", + exec_id, + start.elapsed().as_millis() + ); + res + } Err(e) => { - return Ok(ExecuteResult::err(format!( - "Failed to get TaskManager: {}", + log::error!( + "[{}] Execution failed after {}ms. 
Error: {:#}", + exec_id, + start.elapsed().as_millis(), e - ))); + ); + ExecuteResult::err(format!("Execution failed: {:#}", e)) } - }; - let executor = Executor::new(task_manager.clone()); - let result = executor - .execute(plan.as_ref()) - .map_err(|e| anyhow::anyhow!(e)) - .context("Executor phase failed"); - - log::debug!( - "Executor phase finished in {}ms", - start.elapsed().as_millis() - ); - result + } } } diff --git a/src/coordinator/dataset/mod.rs b/src/coordinator/dataset/mod.rs index b72613da..bbcac6f0 100644 --- a/src/coordinator/dataset/mod.rs +++ b/src/coordinator/dataset/mod.rs @@ -12,8 +12,16 @@ mod data_set; mod execute_result; +mod show_catalog_tables_result; +mod show_create_streaming_table_result; +mod show_create_table_result; mod show_functions_result; +mod show_streaming_tables_result; pub use data_set::{DataSet, empty_record_batch}; pub use execute_result::ExecuteResult; +pub use show_catalog_tables_result::ShowCatalogTablesResult; +pub use show_create_streaming_table_result::ShowCreateStreamingTableResult; +pub use show_create_table_result::ShowCreateTableResult; pub use show_functions_result::ShowFunctionsResult; +pub use show_streaming_tables_result::ShowStreamingTablesResult; diff --git a/src/coordinator/dataset/show_catalog_tables_result.rs b/src/coordinator/dataset/show_catalog_tables_result.rs new file mode 100644 index 00000000..9811ff82 --- /dev/null +++ b/src/coordinator/dataset/show_catalog_tables_result.rs @@ -0,0 +1,105 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::Schema as DfSchema; + +use super::DataSet; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{catalog_table_row_detail, schema_columns_one_line}; + +#[derive(Clone, Debug)] +pub struct ShowCatalogTablesResult { + names: Vec, + kinds: Vec, + column_counts: Vec, + schema_lines: Vec, + details: Vec, +} + +impl ShowCatalogTablesResult { + pub fn from_tables(tables: &[Arc]) -> Self { + let mut names = Vec::with_capacity(tables.len()); + let mut kinds = Vec::with_capacity(tables.len()); + let mut column_counts = Vec::with_capacity(tables.len()); + let mut schema_lines = Vec::with_capacity(tables.len()); + let mut details = Vec::with_capacity(tables.len()); + + for t in tables { + let schema = match t.as_ref() { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + source.produce_physical_schema() + } + CatalogTable::TableFromQuery { .. } => DfSchema::new(t.get_fields()), + }; + let ncols = schema.fields().len() as i32; + names.push(t.name().to_string()); + kinds.push( + match t.as_ref() { + CatalogTable::ConnectorTable(_) => "SOURCE", + CatalogTable::LookupTable(_) => "LOOKUP", + CatalogTable::TableFromQuery { .. 
} => "QUERY", + } + .to_string(), + ); + column_counts.push(ncols); + schema_lines.push(schema_columns_one_line(&schema)); + details.push(catalog_table_row_detail(t.as_ref())); + } + + Self { + names, + kinds, + column_counts, + schema_lines, + details, + } + } +} + +impl DataSet for ShowCatalogTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("kind", DataType::Utf8, false), + Field::new("column_count", DataType::Int32, false), + Field::new("schema_columns", DataType::Utf8, false), + Field::new("details", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from( + self.names.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(StringArray::from( + self.kinds.iter().map(|s| s.as_str()).collect::>(), + )), + Arc::new(Int32Array::from(self.column_counts.clone())), + Arc::new(StringArray::from( + self.schema_lines + .iter() + .map(|s| s.as_str()) + .collect::>(), + )), + Arc::new(StringArray::from( + self.details.iter().map(|s| s.as_str()).collect::>(), + )), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_streaming_table_result.rs b/src/coordinator/dataset/show_create_streaming_table_result.rs new file mode 100644 index 00000000..1796814b --- /dev/null +++ b/src/coordinator/dataset/show_create_streaming_table_result.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; +use protocol::function_stream_graph::FsProgram; + +use crate::sql::common::render_program_topology; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateStreamingTableResult { + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, +} + +impl ShowCreateStreamingTableResult { + pub fn new( + table_name: String, + status: String, + pipeline_detail: String, + program: FsProgram, + ) -> Self { + Self { + table_name, + status, + pipeline_detail, + program, + } + } +} + +impl DataSet for ShowCreateStreamingTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let topology = render_program_topology(&self.program); + + let schema = Arc::new(Schema::new(vec![ + Field::new("Streaming Table", DataType::Utf8, false), + Field::new("Status", DataType::Utf8, false), + Field::new("Pipelines", DataType::Utf8, false), + Field::new("Topology", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.status.as_str()])), + Arc::new(StringArray::from(vec![self.pipeline_detail.as_str()])), + Arc::new(StringArray::from(vec![topology.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_create_table_result.rs b/src/coordinator/dataset/show_create_table_result.rs new 
file mode 100644 index 00000000..47f49d59 --- /dev/null +++ b/src/coordinator/dataset/show_create_table_result.rs @@ -0,0 +1,51 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::StringArray; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; + +#[derive(Clone, Debug)] +pub struct ShowCreateTableResult { + table_name: String, + create_sql: String, +} + +impl ShowCreateTableResult { + pub fn new(table_name: String, create_sql: String) -> Self { + Self { + table_name, + create_sql, + } + } +} + +impl DataSet for ShowCreateTableResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ])); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec![self.table_name.as_str()])), + Arc::new(StringArray::from(vec![self.create_sql.as_str()])), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} diff --git a/src/coordinator/dataset/show_streaming_tables_result.rs b/src/coordinator/dataset/show_streaming_tables_result.rs new file mode 100644 index 00000000..cae597ac --- /dev/null +++ b/src/coordinator/dataset/show_streaming_tables_result.rs @@ -0,0 +1,79 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the 
License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow_array::{Int32Array, StringArray}; +use arrow_schema::{DataType, Field, Schema}; + +use super::DataSet; +use crate::runtime::streaming::job::StreamingJobSummary; + +#[derive(Clone, Debug)] +pub struct ShowStreamingTablesResult { + jobs: Vec, +} + +impl ShowStreamingTablesResult { + pub fn new(jobs: Vec) -> Self { + Self { jobs } + } +} + +impl DataSet for ShowStreamingTablesResult { + fn to_record_batch(&self) -> arrow_array::RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("job_id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, false), + Field::new("pipeline_count", DataType::Int32, false), + Field::new("uptime", DataType::Utf8, false), + ])); + + let job_ids: Vec<&str> = self.jobs.iter().map(|j| j.job_id.as_str()).collect(); + let statuses: Vec<&str> = self.jobs.iter().map(|j| j.status.as_str()).collect(); + let pipeline_counts: Vec = self.jobs.iter().map(|j| j.pipeline_count).collect(); + let uptimes: Vec = self + .jobs + .iter() + .map(|j| format_duration(j.uptime_secs)) + .collect(); + let uptime_refs: Vec<&str> = uptimes.iter().map(|s| s.as_str()).collect(); + + arrow_array::RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(job_ids)), + Arc::new(StringArray::from(statuses)), + Arc::new(Int32Array::from(pipeline_counts)), + Arc::new(StringArray::from(uptime_refs)), + ], + ) + .unwrap_or_else(|_| arrow_array::RecordBatch::new_empty(Arc::new(Schema::empty()))) + } +} + +fn format_duration(total_secs: u64) -> String { + let days = total_secs / 
86400; + let hours = (total_secs % 86400) / 3600; + let mins = (total_secs % 3600) / 60; + let secs = total_secs % 60; + + if days > 0 { + format!("{days}d {hours}h {mins}m {secs}s") + } else if hours > 0 { + format!("{hours}h {mins}m {secs}s") + } else if mins > 0 { + format!("{mins}m {secs}s") + } else { + format!("{secs}s") + } +} diff --git a/src/coordinator/execution/executor.rs b/src/coordinator/execution/executor.rs index 7e44217e..0000d0cf 100644 --- a/src/coordinator/execution/executor.rs +++ b/src/coordinator/execution/executor.rs @@ -10,16 +10,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::coordinator::dataset::{ExecuteResult, ShowFunctionsResult, empty_record_batch}; +use std::sync::Arc; + +use protocol::function_stream_graph::FsProgram; +use thiserror::Error; +use tracing::{debug, info, warn}; + +use crate::coordinator::dataset::{ + ExecuteResult, ShowCatalogTablesResult, ShowCreateStreamingTableResult, ShowCreateTableResult, + ShowFunctionsResult, ShowStreamingTablesResult, empty_record_batch, +}; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, PlanVisitor, - PlanVisitorContext, PlanVisitorResult, ShowFunctionsPlan, StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, CreateTablePlanBody, + DropFunctionPlan, DropStreamingTablePlan, DropTablePlan, LookupTablePlan, PlanNode, + PlanVisitor, PlanVisitorContext, PlanVisitorResult, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, + StartFunctionPlan, StopFunctionPlan, StreamingTable, StreamingTableConnectorPlan, }; use crate::coordinator::statement::{ConfigSource, FunctionSource}; +use crate::runtime::streaming::job::JobManager; +use crate::runtime::streaming::protocol::control::StopMode; use crate::runtime::taskexecutor::TaskManager; -use std::sync::Arc; 
-use thiserror::Error; -use tracing::{debug, info}; +use crate::sql::schema::show_create_catalog_table; +use crate::sql::schema::table::Table as CatalogTable; +use crate::storage::stream_catalog::CatalogManager; #[derive(Error, Debug)] pub enum ExecuteError { @@ -35,11 +49,21 @@ pub enum ExecuteError { pub struct Executor { task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, } impl Executor { - pub fn new(task_manager: Arc) -> Self { - Self { task_manager } + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } } pub fn execute(&self, plan: &dyn PlanNode) -> Result { @@ -50,8 +74,11 @@ impl Executor { match visitor_result { PlanVisitorResult::Execute(result) => { - let elapsed = timer.elapsed(); - debug!(target: "executor", elapsed_ms = elapsed.as_millis(), "Execution completed"); + debug!( + target: "executor", + elapsed_ms = timer.elapsed().as_millis(), + "Execution completed" + ); result } } @@ -59,23 +86,22 @@ impl Executor { } impl PlanVisitor for Executor { - #[allow(clippy::redundant_closure_call)] fn visit_create_function( &self, plan: &CreateFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { let function_bytes = match &plan.function_source { FunctionSource::Path(path) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read function at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read function at {path}: {e}")) })?, FunctionSource::Bytes(bytes) => bytes.clone(), }; let config_bytes = match &plan.config_source { Some(ConfigSource::Path(path)) => std::fs::read(path).map_err(|e| { - ExecuteError::Validation(format!("Failed to read config at {}: {}", path, e)) + ExecuteError::Validation(format!("Failed to read config at {path}: {e}")) })?, Some(ConfigSource::Bytes(bytes)) => bytes.clone(), None => { @@ -88,35 +114,34 @@ 
impl PlanVisitor for Executor { info!(config_size = config_bytes.len(), "Registering Wasm task"); self.task_manager .register_task(&config_bytes, &function_bytes) - .map_err(|e| ExecuteError::Task(format!("Registration failed: {:?}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Registration failed: {e:?}")))?; Ok(ExecuteResult::ok_with_data( "Function registered successfully", empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_drop_function( &self, plan: &DropFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { + let execute = || -> Result { self.task_manager .remove_task(&plan.name) - .map_err(|e| ExecuteError::Task(format!("Removal failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Removal failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Function '{}' dropped", plan.name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_start_function( @@ -138,48 +163,87 @@ impl PlanVisitor for Executor { PlanVisitorResult::Execute(result) } - #[allow(clippy::redundant_closure_call)] fn visit_show_functions( &self, _plan: &ShowFunctionsPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = { - let functions = self.task_manager.list_all_functions(); + let functions = self.task_manager.list_all_functions(); + let result = ExecuteResult::ok_with_data( + format!("Found {} task(s)", functions.len()), + ShowFunctionsResult::new(functions), + ); + PlanVisitorResult::Execute(Ok(result)) + } + + fn visit_show_catalog_tables( + &self, + _plan: &ShowCatalogTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let tables = match self.catalog_manager.list_catalog_tables() { + Ok(tables) => tables, + Err(e) => { + return 
PlanVisitorResult::Execute(Err(ExecuteError::Internal(e.to_string()))); + } + }; + let n = tables.len(); + let result = ExecuteResult::ok_with_data( + format!("{n} stream catalog table(s)"), + ShowCatalogTablesResult::from_tables(&tables), + ); + PlanVisitorResult::Execute(Ok(result)) + } + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let t = self + .catalog_manager + .get_catalog_table(&plan.table_name) + .map_err(|e| ExecuteError::Internal(e.to_string()))? + .ok_or_else(|| { + ExecuteError::Validation(format!( + "Table '{}' not found in stream catalog", + plan.table_name + )) + })?; + let ddl = show_create_catalog_table(t.as_ref()); Ok(ExecuteResult::ok_with_data( - format!("Found {} task(s)", functions.len()), - ShowFunctionsResult::new(functions), + format!("SHOW CREATE TABLE {}", plan.table_name), + ShowCreateTableResult::new(plan.table_name.clone(), ddl), )) }; - - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } - #[allow(clippy::redundant_closure_call)] fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, _context: &PlanVisitorContext, ) -> PlanVisitorResult { - let result = (|| -> Result { - let modules: Vec<(String, Vec)> = plan + let execute = || -> Result { + let modules = plan .modules .iter() .map(|m| (m.name.clone(), m.bytes.clone())) - .collect(); + .collect::>(); self.task_manager .register_python_task(plan.config_content.as_bytes(), &modules) - .map_err(|e| ExecuteError::Task(format!("Python registration failed: {}", e)))?; + .map_err(|e| ExecuteError::Task(format!("Python registration failed: {e}")))?; Ok(ExecuteResult::ok_with_data( format!("Python function '{}' deployed", plan.class_name), empty_record_batch(), )) - })(); + }; - PlanVisitorResult::Execute(result) + PlanVisitorResult::Execute(execute()) } fn visit_stop_function( @@ -200,4 +264,256 @@ impl PlanVisitor for Executor { 
PlanVisitorResult::Execute(result) } + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let (table_name, if_not_exists, catalog_table) = match &plan.body { + CreateTablePlanBody::ConnectorSource { + source_table, + if_not_exists, + } => { + let table_name = source_table.name().to_string(); + let table_instance = + CatalogTable::ConnectorTable(source_table.as_ref().clone()); + (table_name, *if_not_exists, table_instance) + } + CreateTablePlanBody::DataFusion(_) => { + return Err(ExecuteError::Internal( + "Operation not supported: Currently, the system strictly supports creating tables backed by an external Connector Source (e.g., Kafka, Postgres). In-memory tables, Views, or CTAS (Create Table As Select) are not supported." + .into(), + )); + } + }; + + if if_not_exists && self.catalog_manager.has_catalog_table(&table_name) { + return Ok(ExecuteResult::ok(format!( + "Table '{table_name}' already exists (skipped)" + ))); + } + + self.catalog_manager + .add_catalog_table(catalog_table) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to register connector source table '{}': {}", + table_name, e + )) + })?; + + Ok(ExecuteResult::ok(format!( + "Created connector source table '{table_name}'" + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let fs_program: FsProgram = plan.program.clone().into(); + let job_manager: Arc = Arc::clone(&self.job_manager); + + let job_id = plan.name.clone(); + let job_id = tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.submit_job(job_id, fs_program.clone())) + }) + .map_err(|e| ExecuteError::Internal(format!("Failed to submit streaming job: {e}")))?; + + self.catalog_manager + .persist_streaming_job( + &plan.name, + 
&fs_program, + plan.comment.as_deref().unwrap_or(""), + ) + .map_err(|e| { + ExecuteError::Internal(format!( + "Streaming job '{}' submitted but persistence failed: {e}", + plan.name + )) + })?; + + info!( + job_id = %job_id, + table = %plan.name, + "Streaming job submitted and persisted" + ); + + Ok(ExecuteResult::ok_with_data( + format!( + "Streaming table '{}' created, job_id = {}", + plan.name, job_id + ), + empty_record_batch(), + )) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_lookup_table( + &self, + _plan: &LookupTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "LookupTable execution not yet implemented".to_string(), + ))) + } + + fn visit_streaming_connector_table( + &self, + _plan: &StreamingTableConnectorPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + PlanVisitorResult::Execute(Err(ExecuteError::Internal( + "StreamingTableConnector execution not yet implemented".to_string(), + ))) + } + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + self.catalog_manager + .drop_catalog_table(&plan.table_name, plan.if_exists) + .map_err(|e| ExecuteError::Internal(e.to_string()))?; + + Ok(ExecuteResult::ok(format!( + "Dropped table '{}'", + plan.table_name + ))) + }; + + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_streaming_tables( + &self, + _plan: &ShowStreamingTablesPlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let jobs = self.job_manager.list_jobs(); + let n = jobs.len(); + Ok(ExecuteResult::ok_with_data( + format!("{n} streaming table(s)"), + ShowStreamingTablesResult::new(jobs), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult 
{ + let execute = || -> Result { + let detail = self + .job_manager + .get_job_detail(&plan.table_name) + .ok_or_else(|| { + ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + )) + })?; + + let pipeline_lines: Vec = detail + .pipelines + .iter() + .map(|p| format!(" pipeline[{}]: {}", p.pipeline_id, p.status)) + .collect(); + let pipeline_detail = if pipeline_lines.is_empty() { + "(no pipelines)".to_string() + } else { + pipeline_lines.join("\n") + }; + + Ok(ExecuteResult::ok_with_data( + format!("SHOW CREATE STREAMING TABLE {}", plan.table_name), + ShowCreateStreamingTableResult::new( + plan.table_name.clone(), + detail.status.to_string(), + pipeline_detail, + detail.program, + ), + )) + }; + PlanVisitorResult::Execute(execute()) + } + + fn visit_drop_streaming_table( + &self, + plan: &DropStreamingTablePlan, + _context: &PlanVisitorContext, + ) -> PlanVisitorResult { + let execute = || -> Result { + let job_exists = self.job_manager.has_job(&plan.table_name); + + if !job_exists && !plan.if_exists { + return Err(ExecuteError::Validation(format!( + "Streaming table '{}' not found in active jobs", + plan.table_name + ))); + } + + if job_exists { + let job_manager = Arc::clone(&self.job_manager); + let table_name = plan.table_name.clone(); + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current() + .block_on(job_manager.remove_job(&table_name, StopMode::Graceful)) + }) + .map_err(|e| { + ExecuteError::Internal(format!( + "Failed to stop streaming job '{}': {}", + plan.table_name, e + )) + })?; + + info!( + table = %plan.table_name, + "Streaming job stopped and removed" + ); + } + + if let Err(e) = self.catalog_manager.remove_streaming_job(&plan.table_name) { + warn!( + table = %plan.table_name, + error = %e, + "Failed to remove streaming job persisted definition (non-fatal)" + ); + } + + let _ = self + .catalog_manager + .drop_catalog_table(&plan.table_name, true); + + if job_exists { + 
Ok(ExecuteResult::ok(format!( + "Dropped streaming table '{}'", + plan.table_name + ))) + } else { + Ok(ExecuteResult::ok(format!( + "Streaming table '{}' does not exist (skipped)", + plan.table_name + ))) + } + }; + + PlanVisitorResult::Execute(execute()) + } } diff --git a/src/coordinator/mod.rs b/src/coordinator/mod.rs index 0b94d4bf..38d4637f 100644 --- a/src/coordinator/mod.rs +++ b/src/coordinator/mod.rs @@ -17,11 +17,15 @@ mod dataset; mod execution; mod execution_context; mod plan; +mod runtime_context; mod statement; +mod tool; pub use coordinator::Coordinator; pub use dataset::{DataSet, ShowFunctionsResult}; pub use statement::{ - CreateFunction, CreatePythonFunction, DropFunction, PythonModule, ShowFunctions, StartFunction, - Statement, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, DropStreamingTableStatement, + DropTableStatement, PythonModule, ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, + ShowFunctions, ShowStreamingTables, StartFunction, Statement, StopFunction, + StreamingTableStatement, }; diff --git a/src/coordinator/plan/create_table_plan.rs b/src/coordinator/plan/create_table_plan.rs new file mode 100644 index 00000000..11ae14a4 --- /dev/null +++ b/src/coordinator/plan/create_table_plan.rs @@ -0,0 +1,55 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::logical_expr::LogicalPlan; + +use crate::sql::schema::SourceTable; + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +/// Payload for [`CreateTablePlan`]: either a DataFusion DDL plan or a connector `CREATE TABLE` (no `AS SELECT`). +#[derive(Debug, Clone)] +pub enum CreateTablePlanBody { + DataFusion(Box), + ConnectorSource { + source_table: Box, + if_not_exists: bool, + }, +} + +#[derive(Debug, Clone)] +pub struct CreateTablePlan { + pub body: CreateTablePlanBody, +} + +impl CreateTablePlan { + pub fn new(logical_plan: LogicalPlan) -> Self { + Self { + body: CreateTablePlanBody::DataFusion(Box::new(logical_plan)), + } + } + + pub fn connector_source(source_table: SourceTable, if_not_exists: bool) -> Self { + Self { + body: CreateTablePlanBody::ConnectorSource { + source_table: Box::new(source_table), + if_not_exists, + }, + } + } +} + +impl PlanNode for CreateTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_create_table_plan(self, context) + } +} diff --git a/src/sql/parser/mod.rs b/src/coordinator/plan/drop_streaming_table_plan.rs similarity index 52% rename from src/sql/parser/mod.rs rename to src/coordinator/plan/drop_streaming_table_plan.rs index 11f4b18e..d06dc836 100644 --- a/src/sql/parser/mod.rs +++ b/src/coordinator/plan/drop_streaming_table_plan.rs @@ -10,33 +10,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod sql_parser; +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; -pub use sql_parser::SqlParser; - -#[derive(Debug)] -pub struct ParseError { - pub message: String, +#[derive(Debug, Clone)] +pub struct DropStreamingTablePlan { + pub table_name: String, + pub if_exists: bool, } -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Parse error: {}", self.message) - } -} - -impl std::error::Error for ParseError {} - -impl From for ParseError { - fn from(message: String) -> Self { - ParseError { message } +impl DropStreamingTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } } } -impl ParseError { - pub fn new(message: impl Into) -> Self { - Self { - message: message.into(), - } +impl PlanNode for DropStreamingTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_streaming_table(self, context) } } diff --git a/src/coordinator/plan/drop_table_plan.rs b/src/coordinator/plan/drop_table_plan.rs new file mode 100644 index 00000000..7d80a7b7 --- /dev/null +++ b/src/coordinator/plan/drop_table_plan.rs @@ -0,0 +1,34 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; + +#[derive(Debug, Clone)] +pub struct DropTablePlan { + pub table_name: String, + pub if_exists: bool, +} + +impl DropTablePlan { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl PlanNode for DropTablePlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_drop_table_plan(self, context) + } +} diff --git a/src/coordinator/plan/logical_plan_visitor.rs b/src/coordinator/plan/logical_plan_visitor.rs index 536fec37..6adc6420 100644 --- a/src/coordinator/plan/logical_plan_visitor.rs +++ b/src/coordinator/plan/logical_plan_visitor.rs @@ -10,59 +10,337 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + +use datafusion::common::{Result, plan_datafusion_err, plan_err}; +use datafusion::execution::SessionStateBuilder; +use datafusion::sql::sqlparser::ast::{ + CreateTable as SqlCreateTable, Expr as SqlExpr, ObjectType, SqlOption, + Statement as DFStatement, TableConstraint, +}; +use datafusion_common::TableReference; +use datafusion_execution::config::SessionConfig; +use datafusion_expr::{Expr, Extension, LogicalPlan, col}; +use sqlparser::ast::Statement; +use tracing::debug; + use crate::coordinator::analyze::analysis::Analysis; use crate::coordinator::plan::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, PlanNode, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, PlanNode, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, + StartFunctionPlan, StopFunctionPlan, StreamingTable, }; use crate::coordinator::statement::{ - CreateFunction, CreatePythonFunction, DropFunction, 
ShowFunctions, StartFunction, - StatementVisitor, StatementVisitorContext, StatementVisitorResult, StopFunction, + CreateFunction, CreatePythonFunction, CreateTable, DropFunction, DropStreamingTableStatement, + DropTableStatement, ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, + ShowFunctions, ShowStreamingTables, StartFunction, StatementVisitor, StatementVisitorContext, + StatementVisitorResult, StopFunction, StreamingTableStatement, }; +use crate::coordinator::tool::ConnectorOptions; +use crate::sql::analysis::{StreamSchemaProvider, maybe_add_key_extension_to_sink, rewrite_sinks}; +use crate::sql::common::with_option_keys as opt; +use crate::sql::functions::{is_json_union, serialize_outgoing_json}; +use crate::sql::logical_node::logical::{LogicalProgram, ProgramConfig}; +use crate::sql::logical_node::sink::StreamEgressNode; +use crate::sql::logical_planner::optimizers::{ChainingOptimizer, produce_optimized_plan}; +use crate::sql::logical_planner::planner::PlanToGraphVisitor; +use crate::sql::rewrite_plan; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::{ColumnDescriptor, ConnectionType, Table}; -#[derive(Debug, Default)] -pub struct LogicalPlanVisitor; +#[derive(Clone)] +pub struct LogicalPlanVisitor { + schema_provider: StreamSchemaProvider, +} impl LogicalPlanVisitor { - pub fn new() -> Self { - Self + pub fn new(schema_provider: StreamSchemaProvider) -> Self { + Self { schema_provider } } pub fn visit(&self, analysis: &Analysis) -> Box { - let context = StatementVisitorContext::Empty; let stmt = analysis.statement(); + let context = StatementVisitorContext::Empty; - let result = stmt.accept(self, &context); - - match result { + match stmt.accept(self, &context) { StatementVisitorResult::Plan(plan) => plan, - _ => panic!("LogicalPlanVisitor should return Plan"), + _ => panic!("Fatal: LogicalPlanVisitor must yield a PlanNode variant"), } } + + pub fn build_streaming_table( + schema_provider: &StreamSchemaProvider, + 
stmt: &StreamingTableStatement, + ) -> Result { + Self::new(schema_provider.clone()).compile_streaming_sink(stmt) + } + + fn compile_streaming_sink(&self, stmt: &StreamingTableStatement) -> Result { + let DFStatement::CreateStreamingTable { + name, + with_options, + comment, + query, + } = &stmt.statement + else { + return plan_err!("Statement mismatch: Expected CREATE STREAMING TABLE AST node"); + }; + + let sink_table_name = name.to_string(); + debug!( + "Initiating streaming sink compilation for identifier: {}", + sink_table_name + ); + + let mut sink_properties = ConnectorOptions::new(with_options, &None)?; + let connector_type = sink_properties + .pull_opt_str(opt::CONNECTOR)? + .ok_or_else(|| { + plan_datafusion_err!( + "Validation Error: Streaming table '{}' requires the '{}' property", + sink_table_name, + opt::CONNECTOR + ) + })?; + + let partition_keys = Self::extract_partitioning_keys(&mut sink_properties)?; + + let sink_description = comment + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| format!("sink `{}` ({connector_type})", sink_table_name)); + + let mut query_logical_plan = rewrite_plan( + produce_optimized_plan(&Statement::Query(query.clone()), &self.schema_provider)?, + &self.schema_provider, + )?; + + if query_logical_plan + .schema() + .fields() + .iter() + .any(|f| is_json_union(f.data_type())) + { + query_logical_plan = + serialize_outgoing_json(&self.schema_provider, Arc::new(query_logical_plan)); + } + + let output_schema_fields = query_logical_plan + .schema() + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut sink_definition = SourceTable::from_options( + &sink_table_name, + &connector_type, + false, + output_schema_fields, + vec![], + None, + &mut sink_properties, + None, + &self.schema_provider, + Some(ConnectionType::Sink), + sink_description, + )?; + sink_definition.partition_exprs = Arc::new(partition_keys); + + let 
output_schema = query_logical_plan.schema().clone(); + let sink_plan_node = StreamEgressNode::try_new( + TableReference::bare(sink_table_name.clone()), + Table::ConnectorTable(sink_definition.clone()), + output_schema, + query_logical_plan, + )?; + + let mut rewritten_plans = rewrite_sinks(vec![maybe_add_key_extension_to_sink( + LogicalPlan::Extension(Extension { + node: Arc::new(sink_plan_node), + }), + )?])?; + + let final_logical_plan = rewritten_plans.remove(0); + + let validated_program = self.validate_graph_topology(&final_logical_plan)?; + + Ok(StreamingTable { + name: sink_table_name, + comment: comment.clone(), + program: validated_program, + }) + } + + fn validate_graph_topology(&self, logical_plan: &LogicalPlan) -> Result { + let mut session_config = SessionConfig::new(); + let opts = session_config.options_mut(); + opts.optimizer.enable_round_robin_repartition = false; + opts.optimizer.repartition_aggregations = false; + opts.optimizer.repartition_windows = false; + opts.optimizer.repartition_sorts = false; + opts.optimizer.repartition_joins = false; + opts.execution.target_partitions = 1; + + let session_state = SessionStateBuilder::new() + .with_config(session_config) + .with_default_features() + .with_physical_optimizer_rules(vec![]) + .build(); + + let mut graph_compiler = PlanToGraphVisitor::new(&self.schema_provider, &session_state); + graph_compiler.add_plan(logical_plan.clone())?; + + let mut executable_program = + LogicalProgram::new(graph_compiler.into_graph(), ProgramConfig::default()); + executable_program.optimize(&ChainingOptimizer {}); + + Ok(executable_program) + } + + fn extract_partitioning_keys(options: &mut ConnectorOptions) -> Result>> { + options + .pull_opt_str(opt::PARTITION_BY)? + .map(|raw_cols| raw_cols.split(',').map(|c| col(c.trim())).collect()) + .map(Ok) + .transpose() + } + + fn contains_connector_property(options: &[SqlOption]) -> bool { + options.iter().any(|opt| match opt { + SqlOption::KeyValue { key, .. 
} => key.value.eq_ignore_ascii_case(opt::CONNECTOR), + _ => false, + }) + } + + fn parse_primary_keys(constraints: &[TableConstraint]) -> Result> { + let mut keys = None; + for constraint in constraints { + if let TableConstraint::PrimaryKey { columns, .. } = constraint { + if keys.is_some() { + return plan_err!( + "Constraint Violation: Multiple PRIMARY KEY constraints are forbidden" + ); + } + keys = Some(columns.iter().map(|ident| ident.value.clone()).collect()); + } + } + Ok(keys.unwrap_or_default()) + } + + fn parse_watermark_strategy( + constraints: &[TableConstraint], + ) -> Result)>> { + let mut strategy = None; + for constraint in constraints { + if let TableConstraint::Watermark { + column_name, + watermark_expr, + } = constraint + { + if strategy.is_some() { + return plan_err!( + "Constraint Violation: Only a single WATERMARK FOR clause is permitted" + ); + } + strategy = Some((column_name.value.clone(), watermark_expr.clone())); + } + } + Ok(strategy) + } + + fn compile_connector_source_plan(&self, stmt: &SqlCreateTable) -> Result { + if stmt.query.is_some() { + return plan_err!( + "Syntax Error: CREATE TABLE ... AS SELECT combined with WITH ('connector'=...) is invalid. Use CREATE STREAMING TABLE instead." + ); + } + if stmt.or_replace { + return plan_err!( + "Syntax Error: OR REPLACE is not supported for external connector tables." + ); + } + if stmt.temporary { + return plan_err!( + "Syntax Error: TEMPORARY is not supported for external connector tables." + ); + } + if stmt.external { + return plan_err!( + "Syntax Error: EXTERNAL keyword is redundant and unsupported for connector configurations." 
+ ); + } + + let target_name = stmt.name.to_string(); + let table_description = stmt + .comment + .clone() + .map(|c| c.to_string()) + .unwrap_or_default(); + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + let arrow_schema = schema_compiler.build_schema(stmt.columns.clone())?; + + let schema_descriptors = arrow_schema + .fields() + .iter() + .map(|f| ColumnDescriptor::from((**f).clone())) + .collect::>(); + + let mut connector_options = ConnectorOptions::new(&stmt.with_options, &None)?; + let adapter_type = connector_options + .pull_opt_str(opt::CONNECTOR)? + .ok_or_else(|| { + plan_datafusion_err!( + "Configuration Error: Missing required property '{}' in WITH clause", + opt::CONNECTOR + ) + })?; + + let pk_constraints = Self::parse_primary_keys(&stmt.constraints)?; + let watermark_strategy = Self::parse_watermark_strategy(&stmt.constraints)?; + + let source_definition = SourceTable::from_options( + &target_name, + &adapter_type, + false, + schema_descriptors, + pk_constraints, + watermark_strategy, + &mut connector_options, + None, + &self.schema_provider, + Some(ConnectionType::Source), + table_description, + )?; + + Ok(CreateTablePlan::connector_source( + source_definition, + stmt.if_not_exists, + )) + } } impl StatementVisitor for LogicalPlanVisitor { fn visit_create_function( &self, stmt: &CreateFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let function_source = stmt.get_function_source().clone(); - let config_source = stmt.get_config_source().cloned(); - let extra_props = stmt.get_extra_properties().clone(); - - // Name will be read from config file during execution StatementVisitorResult::Plan(Box::new(CreateFunctionPlan::new( - function_source, - config_source, - extra_props, + stmt.get_function_source().clone(), + stmt.get_config_source().cloned(), + stmt.get_extra_properties().clone(), ))) } fn visit_drop_function( &self, stmt: &DropFunction, - 
_context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(DropFunctionPlan::new(stmt.name.clone()))) } @@ -70,7 +348,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_start_function( &self, stmt: &StartFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StartFunctionPlan::new(stmt.name.clone()))) } @@ -78,7 +356,7 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_stop_function( &self, stmt: &StopFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(StopFunctionPlan::new(stmt.name.clone()))) } @@ -86,24 +364,138 @@ impl StatementVisitor for LogicalPlanVisitor { fn visit_show_functions( &self, _stmt: &ShowFunctions, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { StatementVisitorResult::Plan(Box::new(ShowFunctionsPlan::new())) } + fn visit_show_catalog_tables( + &self, + _stmt: &ShowCatalogTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCatalogTablesPlan::new())) + } + + fn visit_show_create_table( + &self, + stmt: &ShowCreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateTablePlan::new(stmt.table_name.clone()))) + } + fn visit_create_python_function( &self, stmt: &CreatePythonFunction, - _context: &StatementVisitorContext, + _ctx: &StatementVisitorContext, ) -> StatementVisitorResult { - let class_name = stmt.get_class_name().to_string(); - let modules = stmt.get_modules().to_vec(); - let config_content = stmt.get_config_content().to_string(); - StatementVisitorResult::Plan(Box::new(CreatePythonFunctionPlan::new( - class_name, - modules, - config_content, + 
stmt.get_class_name().to_string(), + stmt.get_modules().to_vec(), + stmt.get_config_content().to_string(), + ))) + } + + fn visit_create_table( + &self, + stmt: &CreateTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + if let Statement::CreateTable(ast_node) = &stmt.statement + && ast_node.query.is_none() + && Self::contains_connector_property(&ast_node.with_options) + { + let execution_plan = + self.compile_connector_source_plan(ast_node) + .unwrap_or_else(|err| { + panic!( + "Fatal Compiler Error: Connector source resolution failed - {err:#}" + ); + }); + return StatementVisitorResult::Plan(Box::new(execution_plan)); + } + + let schema_compiler = datafusion::sql::planner::SqlToRel::new(&self.schema_provider); + match schema_compiler.sql_statement_to_plan(stmt.statement.clone()) { + Ok(logical_plan) => { + debug!( + "Successfully compiled logical DDL topology:\n{}", + logical_plan.display_graphviz() + ); + StatementVisitorResult::Plan(Box::new(CreateTablePlan::new(logical_plan))) + } + Err(err) => panic!("Fatal Compiler Error: Logical plan translation failed - {err}"), + } + } + + fn visit_streaming_table_statement( + &self, + stmt: &StreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let execution_plan = self.compile_streaming_sink(stmt).unwrap_or_else(|err| { + panic!("Fatal Compiler Error: Streaming sink compilation aborted - {err}"); + }); + StatementVisitorResult::Plan(Box::new(execution_plan)) + } + + fn visit_drop_table_statement( + &self, + stmt: &DropTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + let DFStatement::Drop { + object_type, + if_exists, + names, + .. 
+ } = &stmt.statement + else { + panic!("Fatal Compiler Error: AST mismatch on DropTableStatement"); + }; + + if *object_type != ObjectType::Table { + panic!("Fatal Compiler Error: Drop target must be of type TABLE"); + } + if names.len() != 1 { + panic!( + "Fatal Compiler Error: Bulk drop operations are not supported. Specify exactly one table." + ); + } + + StatementVisitorResult::Plan(Box::new(DropTablePlan::new( + names[0].to_string(), + *if_exists, + ))) + } + + fn visit_show_streaming_tables( + &self, + _stmt: &ShowStreamingTables, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowStreamingTablesPlan::new())) + } + + fn visit_show_create_streaming_table( + &self, + stmt: &ShowCreateStreamingTable, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(ShowCreateStreamingTablePlan::new( + stmt.table_name.clone(), + ))) + } + + fn visit_drop_streaming_table( + &self, + stmt: &DropStreamingTableStatement, + _ctx: &StatementVisitorContext, + ) -> StatementVisitorResult { + StatementVisitorResult::Plan(Box::new(DropStreamingTablePlan::new( + stmt.table_name.clone(), + stmt.if_exists, ))) } } diff --git a/src/coordinator/plan/lookup_table_plan.rs b/src/coordinator/plan/lookup_table_plan.rs new file mode 100644 index 00000000..65103b61 --- /dev/null +++ b/src/coordinator/plan/lookup_table_plan.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
use crate::sql::schema::source_table::SourceTable;

use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult};

/// Plan node that exposes a lookup table config as a logical plan input.
#[derive(Debug)]
pub struct LookupTablePlan {
    /// Resolved connector/source configuration backing the lookup table.
    pub table: SourceTable,
}

impl PlanNode for LookupTablePlan {
    /// Double-dispatches to [`PlanVisitor::visit_lookup_table`].
    fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult {
        visitor.visit_lookup_table(self, context)
    }
}
use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult};

/// Plan node for `SHOW TABLES` over the stream catalog; carries no payload.
#[derive(Debug, Default)]
pub struct ShowCatalogTablesPlan;

impl ShowCatalogTablesPlan {
    /// Creates the (stateless) plan node.
    pub fn new() -> Self {
        Self
    }
}

impl PlanNode for ShowCatalogTablesPlan {
    /// Double-dispatches to [`PlanVisitor::visit_show_catalog_tables`].
    fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult {
        visitor.visit_show_catalog_tables(self, context)
    }
}
use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult};

/// Plan node for `SHOW CREATE STREAMING TABLE <name>`.
#[derive(Debug, Clone)]
pub struct ShowCreateStreamingTablePlan {
    /// Name of the streaming table whose definition should be rendered.
    pub table_name: String,
}

impl ShowCreateStreamingTablePlan {
    /// Creates a plan targeting `table_name`.
    pub fn new(table_name: String) -> Self {
        Self { table_name }
    }
}

impl PlanNode for ShowCreateStreamingTablePlan {
    /// Double-dispatches to [`PlanVisitor::visit_show_create_streaming_table`].
    fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult {
        visitor.visit_show_create_streaming_table(self, context)
    }
}
use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult};

/// Plan node for `SHOW CREATE TABLE <name>`.
#[derive(Debug, Clone)]
pub struct ShowCreateTablePlan {
    /// Name of the table whose DDL should be rendered.
    pub table_name: String,
}

impl ShowCreateTablePlan {
    /// Creates a plan targeting `table_name`.
    pub fn new(table_name: String) -> Self {
        Self { table_name }
    }
}

impl PlanNode for ShowCreateTablePlan {
    /// Double-dispatches to [`PlanVisitor::visit_show_create_table`].
    fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult {
        visitor.visit_show_create_table(self, context)
    }
}
use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult};

/// Plan node for `SHOW STREAMING TABLES`; carries no payload.
#[derive(Debug, Default)]
pub struct ShowStreamingTablesPlan;

impl ShowStreamingTablesPlan {
    /// Creates the (stateless) plan node.
    pub fn new() -> Self {
        Self
    }
}

impl PlanNode for ShowStreamingTablesPlan {
    /// Double-dispatches to [`PlanVisitor::visit_show_streaming_tables`].
    fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult {
        visitor.visit_show_streaming_tables(self, context)
    }
}
+#[derive(Debug)] +pub struct StreamingTableConnectorPlan { + pub table: SourceTable, +} + +impl PlanNode for StreamingTableConnectorPlan { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_connector_table(self, context) + } +} diff --git a/src/coordinator/plan/streaming_table_plan.rs b/src/coordinator/plan/streaming_table_plan.rs new file mode 100644 index 00000000..512ec266 --- /dev/null +++ b/src/coordinator/plan/streaming_table_plan.rs @@ -0,0 +1,28 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{PlanNode, PlanVisitor, PlanVisitorContext, PlanVisitorResult}; +use crate::sql::logical_node::logical::LogicalProgram; + +/// Plan node representing a fully resolved streaming table (DDL). +#[derive(Debug)] +pub struct StreamingTable { + pub name: String, + pub comment: Option, + pub program: LogicalProgram, +} + +impl PlanNode for StreamingTable { + fn accept(&self, visitor: &dyn PlanVisitor, context: &PlanVisitorContext) -> PlanVisitorResult { + visitor.visit_streaming_table(self, context) + } +} diff --git a/src/coordinator/plan/visitor.rs b/src/coordinator/plan/visitor.rs index 44059c67..a94d761f 100644 --- a/src/coordinator/plan/visitor.rs +++ b/src/coordinator/plan/visitor.rs @@ -11,8 +11,10 @@ // limitations under the License. 
use super::{ - CreateFunctionPlan, CreatePythonFunctionPlan, DropFunctionPlan, ShowFunctionsPlan, - StartFunctionPlan, StopFunctionPlan, + CreateFunctionPlan, CreatePythonFunctionPlan, CreateTablePlan, DropFunctionPlan, + DropStreamingTablePlan, DropTablePlan, LookupTablePlan, ShowCatalogTablesPlan, + ShowCreateStreamingTablePlan, ShowCreateTablePlan, ShowFunctionsPlan, ShowStreamingTablesPlan, + StartFunctionPlan, StopFunctionPlan, StreamingTable, StreamingTableConnectorPlan, }; /// Context passed to PlanVisitor methods @@ -79,9 +81,69 @@ pub trait PlanVisitor { context: &PlanVisitorContext, ) -> PlanVisitorResult; + fn visit_show_catalog_tables( + &self, + plan: &ShowCatalogTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_table( + &self, + plan: &ShowCreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + fn visit_create_python_function( &self, plan: &CreatePythonFunctionPlan, context: &PlanVisitorContext, ) -> PlanVisitorResult; + + fn visit_create_table_plan( + &self, + plan: &CreateTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_table( + &self, + plan: &StreamingTable, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_lookup_table( + &self, + plan: &LookupTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_streaming_connector_table( + &self, + plan: &StreamingTableConnectorPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_table_plan( + &self, + plan: &DropTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_streaming_tables( + &self, + plan: &ShowStreamingTablesPlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_show_create_streaming_table( + &self, + plan: &ShowCreateStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; + + fn visit_drop_streaming_table( + &self, + plan: 
&DropStreamingTablePlan, + context: &PlanVisitorContext, + ) -> PlanVisitorResult; } diff --git a/src/coordinator/runtime_context.rs b/src/coordinator/runtime_context.rs new file mode 100644 index 00000000..5d671b98 --- /dev/null +++ b/src/coordinator/runtime_context.rs @@ -0,0 +1,61 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Runtime resources for a single coordinator run: [`TaskManager`], [`CatalogManager`], and [`JobManager`]. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::job::JobManager; +use crate::runtime::taskexecutor::TaskManager; +use crate::sql::schema::StreamSchemaProvider; +use crate::storage::stream_catalog::CatalogManager; + +/// Dependencies shared by analyze / plan / execute, analogous to installing globals in +/// [`TaskManager`], [`CatalogManager`], and [`JobManager`]. 
+#[derive(Clone)] +pub struct CoordinatorRuntimeContext { + pub task_manager: Arc, + pub catalog_manager: Arc, + pub job_manager: Arc, +} + +impl CoordinatorRuntimeContext { + pub fn try_from_globals() -> Result { + Ok(Self { + task_manager: TaskManager::get() + .map_err(|e| anyhow::anyhow!("Failed to get TaskManager: {}", e))?, + catalog_manager: CatalogManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get CatalogManager: {}", e))?, + job_manager: JobManager::global() + .map_err(|e| anyhow::anyhow!("Failed to get JobManager: {}", e))?, + }) + } + + pub fn new( + task_manager: Arc, + catalog_manager: Arc, + job_manager: Arc, + ) -> Self { + Self { + task_manager, + catalog_manager, + job_manager, + } + } + + /// Schema provider for [`LogicalPlanVisitor`] / [`SqlToRel`]. + pub fn planning_schema_provider(&self) -> StreamSchemaProvider { + self.catalog_manager.acquire_planning_context() + } +} diff --git a/src/coordinator/statement/create_table.rs b/src/coordinator/statement/create_table.rs new file mode 100644 index 00000000..67a500d1 --- /dev/null +++ b/src/coordinator/statement/create_table.rs @@ -0,0 +1,44 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// Represents a CREATE TABLE or CREATE VIEW statement. 
use datafusion::sql::sqlparser::ast::Statement as DFStatement;

use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};

/// Represents a CREATE TABLE or CREATE VIEW statement.
///
/// This wraps the raw SQL AST node so the coordinator pipeline can
/// distinguish table/view creation from other streaming SQL operations.
#[derive(Debug)]
pub struct CreateTable {
    /// Raw sqlparser AST node; interpreted later by the plan visitor.
    pub statement: DFStatement,
}

impl CreateTable {
    /// Wraps a parsed CREATE TABLE / CREATE VIEW AST node.
    pub fn new(statement: DFStatement) -> Self {
        Self { statement }
    }
}

impl Statement for CreateTable {
    /// Double-dispatches to [`StatementVisitor::visit_create_table`].
    fn accept(
        &self,
        visitor: &dyn StatementVisitor,
        context: &StatementVisitorContext,
    ) -> StatementVisitorResult {
        visitor.visit_create_table(self, context)
    }

    /// Downcast hook for callers holding a `&dyn Statement`.
    fn as_create_table(&self) -> Option<&CreateTable> {
        Some(self)
    }
}
+#[derive(Debug, Clone)] +pub struct DropStreamingTableStatement { + pub table_name: String, + pub if_exists: bool, +} + +impl DropStreamingTableStatement { + pub fn new(table_name: String, if_exists: bool) -> Self { + Self { + table_name, + if_exists, + } + } +} + +impl Statement for DropStreamingTableStatement { + fn accept( + &self, + visitor: &dyn StatementVisitor, + context: &StatementVisitorContext, + ) -> StatementVisitorResult { + visitor.visit_drop_streaming_table(self, context) + } +} diff --git a/src/coordinator/statement/drop_table.rs b/src/coordinator/statement/drop_table.rs new file mode 100644 index 00000000..fa547dca --- /dev/null +++ b/src/coordinator/statement/drop_table.rs @@ -0,0 +1,41 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::sql::sqlparser::ast::Statement as DFStatement; + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `DROP TABLE` / `DROP TABLE IF EXISTS` (and `DROP STREAMING TABLE`, normalized at parse time). 
use datafusion::sql::sqlparser::ast::Statement as DFStatement;

use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};

/// `DROP TABLE` / `DROP TABLE IF EXISTS` (and `DROP STREAMING TABLE`, normalized at parse time).
#[derive(Debug, Clone)]
pub struct DropTableStatement {
    /// Raw sqlparser `Drop` AST node; object type and name count are validated
    /// later by the plan visitor.
    pub statement: DFStatement,
}

impl DropTableStatement {
    /// Wraps a parsed DROP statement AST node.
    pub fn new(statement: DFStatement) -> Self {
        Self { statement }
    }
}

impl Statement for DropTableStatement {
    /// Double-dispatches to [`StatementVisitor::visit_drop_table_statement`].
    fn accept(
        &self,
        visitor: &dyn StatementVisitor,
        context: &StatementVisitorContext,
    ) -> StatementVisitorResult {
        visitor.visit_drop_table_statement(self, context)
    }

    /// Downcast hook for callers holding a `&dyn Statement`.
    fn as_drop_table_statement(&self) -> Option<&DropTableStatement> {
        Some(self)
    }
}
StatementVisitorResult; + + fn as_create_table(&self) -> Option<&CreateTable> { + None + } + + fn as_drop_table_statement(&self) -> Option<&DropTableStatement> { + None + } + + fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> { + None + } + + fn as_drop_streaming_table_statement(&self) -> Option<&DropStreamingTableStatement> { + None + } } diff --git a/src/coordinator/statement/show_catalog_tables.rs b/src/coordinator/statement/show_catalog_tables.rs new file mode 100644 index 00000000..1f034562 --- /dev/null +++ b/src/coordinator/statement/show_catalog_tables.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult}; + +/// `SHOW TABLES` over the stream catalog (connector sources + streaming sinks). 
+#[derive(Debug, Clone, Default)]
+pub struct ShowCatalogTables;
+
+impl ShowCatalogTables {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl Statement for ShowCatalogTables {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_catalog_tables(self, context)
+    }
+}
diff --git a/src/coordinator/statement/show_create_streaming_table.rs b/src/coordinator/statement/show_create_streaming_table.rs
new file mode 100644
index 00000000..73f16870
--- /dev/null
+++ b/src/coordinator/statement/show_create_streaming_table.rs
@@ -0,0 +1,36 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// `SHOW CREATE STREAMING TABLE <table_name>` — displays the pipeline topology and
+/// runtime metadata for the named streaming job.
+#[derive(Debug, Clone)]
+pub struct ShowCreateStreamingTable {
+    pub table_name: String,
+}
+
+impl ShowCreateStreamingTable {
+    pub fn new(table_name: String) -> Self {
+        Self { table_name }
+    }
+}
+
+impl Statement for ShowCreateStreamingTable {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_create_streaming_table(self, context)
+    }
+}
diff --git a/src/coordinator/statement/show_create_table.rs b/src/coordinator/statement/show_create_table.rs
new file mode 100644
index 00000000..5b54a726
--- /dev/null
+++ b/src/coordinator/statement/show_create_table.rs
@@ -0,0 +1,35 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// `SHOW CREATE TABLE <table_name>` for a stream-catalog table.
+#[derive(Debug, Clone)]
+pub struct ShowCreateTable {
+    pub table_name: String,
+}
+
+impl ShowCreateTable {
+    pub fn new(table_name: String) -> Self {
+        Self { table_name }
+    }
+}
+
+impl Statement for ShowCreateTable {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_create_table(self, context)
+    }
+}
diff --git a/src/coordinator/statement/show_streaming_tables.rs b/src/coordinator/statement/show_streaming_tables.rs
new file mode 100644
index 00000000..cedf3610
--- /dev/null
+++ b/src/coordinator/statement/show_streaming_tables.rs
@@ -0,0 +1,33 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// `SHOW STREAMING TABLES` — lists all active streaming jobs managed by `JobManager`.
+#[derive(Debug, Clone, Default)]
+pub struct ShowStreamingTables;
+
+impl ShowStreamingTables {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl Statement for ShowStreamingTables {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_show_streaming_tables(self, context)
+    }
+}
diff --git a/src/coordinator/statement/streaming_table.rs b/src/coordinator/statement/streaming_table.rs
new file mode 100644
index 00000000..bfef3503
--- /dev/null
+++ b/src/coordinator/statement/streaming_table.rs
@@ -0,0 +1,44 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use datafusion::sql::sqlparser::ast::Statement as DFStatement;
+
+use super::{Statement, StatementVisitor, StatementVisitorContext, StatementVisitorResult};
+
+/// Wrapper for **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (parsed AST).
+///
+/// The coordinator `parse_sql` frontend does **not** support `INSERT`; streaming sinks are
+/// defined only via **`CREATE STREAMING TABLE`** (and regular tables via **`CREATE TABLE`**).
+#[derive(Debug)]
+pub struct StreamingTableStatement {
+    pub statement: DFStatement,
+}
+
+impl StreamingTableStatement {
+    pub fn new(statement: DFStatement) -> Self {
+        Self { statement }
+    }
+}
+
+impl Statement for StreamingTableStatement {
+    fn accept(
+        &self,
+        visitor: &dyn StatementVisitor,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult {
+        visitor.visit_streaming_table_statement(self, context)
+    }
+
+    fn as_streaming_table_statement(&self) -> Option<&StreamingTableStatement> {
+        Some(self)
+    }
+}
diff --git a/src/coordinator/statement/visitor.rs b/src/coordinator/statement/visitor.rs
index 13ce2cfc..48543127 100644
--- a/src/coordinator/statement/visitor.rs
+++ b/src/coordinator/statement/visitor.rs
@@ -11,7 +11,9 @@
 // limitations under the License.
 
 use super::{
-    CreateFunction, CreatePythonFunction, DropFunction, ShowFunctions, StartFunction, StopFunction,
+    CreateFunction, CreatePythonFunction, CreateTable, DropFunction, DropStreamingTableStatement,
+    DropTableStatement, ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable,
+    ShowFunctions, ShowStreamingTables, StartFunction, StopFunction, StreamingTableStatement,
 };
 use crate::coordinator::plan::PlanNode;
 use crate::coordinator::statement::Statement;
@@ -82,9 +84,57 @@
         context: &StatementVisitorContext,
     ) -> StatementVisitorResult;
 
+    fn visit_show_catalog_tables(
+        &self,
+        stmt: &ShowCatalogTables,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_show_create_table(
+        &self,
+        stmt: &ShowCreateTable,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
     fn visit_create_python_function(
         &self,
         stmt: &CreatePythonFunction,
         context: &StatementVisitorContext,
     ) -> StatementVisitorResult;
+
+    fn visit_create_table(
+        &self,
+        stmt: &CreateTable,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_streaming_table_statement(
+        &self,
+        stmt: &StreamingTableStatement,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_drop_table_statement(
+        &self,
+        stmt: &DropTableStatement,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_show_streaming_tables(
+        &self,
+        stmt: &ShowStreamingTables,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_show_create_streaming_table(
+        &self,
+        stmt: &ShowCreateStreamingTable,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
+
+    fn visit_drop_streaming_table(
+        &self,
+        stmt: &DropStreamingTableStatement,
+        context: &StatementVisitorContext,
+    ) -> StatementVisitorResult;
 }
diff --git a/src/runtime/sink/mod.rs b/src/coordinator/tool/mod.rs
similarity index 91%
rename from src/runtime/sink/mod.rs
rename to src/coordinator/tool/mod.rs
index a0a2a6fc..6b48aa0e 100644
--- a/src/runtime/sink/mod.rs
+++ b/src/coordinator/tool/mod.rs
@@ -10,6 +10,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Sink module
-
-// TODO: Add sink implementation here
+pub use crate::sql::common::ConnectorOptions;
diff --git a/src/main.rs b/src/main.rs
index 562b1526..46da3c7a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -179,10 +179,7 @@
     );
 
     // 2. Component Initialization
-    let registry = server::register_components();
-    registry
-        .initialize_all(&config)
-        .context("Component initialization failed")?;
+    server::bootstrap_system(&config).context("Component initialization failed")?;
 
     // 3. Server Startup
     let mut server_handle = spawn_server_thread(config.clone())?;
diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs
index f69ad017..1ba5e2a3 100644
--- a/src/runtime/mod.rs
+++ b/src/runtime/mod.rs
@@ -14,10 +14,12 @@
 pub mod buffer_and_event;
 pub mod common;
-pub mod input;
-pub mod output;
-pub mod processor;
-pub mod sink;
-pub mod source;
+pub mod streaming;
 pub mod task;
 pub mod taskexecutor;
+pub mod util;
+pub mod wasm;
+
+pub use wasm::input;
+pub use wasm::output;
+pub use wasm::processor;
diff --git a/src/runtime/streaming/api/context.rs b/src/runtime/streaming/api/context.rs
new file mode 100644
index 00000000..f9dc805e
--- /dev/null
+++ b/src/runtime/streaming/api/context.rs
@@ -0,0 +1,200 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
+
+use anyhow::{Context, Result};
+use arrow_array::RecordBatch;
+
+use crate::runtime::streaming::memory::MemoryPool;
+use crate::runtime::streaming::network::endpoint::PhysicalSender;
+use crate::runtime::streaming::protocol::event::{StreamEvent, TrackedEvent};
+
+#[derive(Debug, Clone)]
+pub struct TaskContextConfig {
+    pub source_idle_timeout: Duration,
+}
+
+impl Default for TaskContextConfig {
+    fn default() -> Self {
+        Self {
+            source_idle_timeout: Duration::from_millis(50),
+        }
+    }
+}
+
+/// Task execution context.
+/// +/// Acts as the sole bridge between operators and engine infrastructure (network, memory, +/// configuration) for a single subtask. +pub struct TaskContext { + /// Job identifier. + pub job_id: String, + /// Logical pipeline (vertex) index within the job graph. + pub pipeline_id: u32, + /// This subtask's index within the pipeline's parallelism. + pub subtask_index: u32, + /// Number of parallel subtasks for this pipeline. + pub parallelism: u32, + + /// Precomputed display string for high-frequency logging without per-call allocation. + task_name: String, + + /// Downstream physical senders (outbound edges). + downstream_senders: Vec, + + /// Global memory pool for back-pressure and accounting. + memory_pool: Arc, + + /// Latest aligned event-time watermark for this subtask. + current_watermark: Option, + + /// Subtask-level tunables. + config: TaskContextConfig, +} + +impl TaskContext { + pub fn new( + job_id: String, + pipeline_id: u32, + subtask_index: u32, + parallelism: u32, + downstream_senders: Vec, + memory_pool: Arc, + ) -> Self { + let task_name = format!( + "Task-[{}]-Pipe[{}]-Sub[{}/{}]", + job_id, pipeline_id, subtask_index, parallelism + ); + + Self { + job_id, + pipeline_id, + subtask_index, + parallelism, + task_name, + downstream_senders, + memory_pool, + current_watermark: None, + config: TaskContextConfig::default(), + } + } + + #[inline] + pub fn config(&self) -> &TaskContextConfig { + &self.config + } + + #[inline] + pub fn task_name(&self) -> &str { + &self.task_name + } + + // ------------------------------------------------------------------------- + // Watermark + // ------------------------------------------------------------------------- + + #[inline] + pub fn current_watermark(&self) -> Option { + self.current_watermark + } + + pub fn advance_watermark(&mut self, watermark: SystemTime) { + if let Some(current) = self.current_watermark { + self.current_watermark = Some(current.max(watermark)); + } else { + self.current_watermark = 
Some(watermark); + } + } + + // ------------------------------------------------------------------------- + // Data emission + // ------------------------------------------------------------------------- + + /// Fan-out a data batch to all downstreams (forward / broadcast). + pub async fn collect(&self, batch: RecordBatch) -> Result<()> { + if self.downstream_senders.is_empty() { + return Ok(()); + } + + let bytes_required = batch.get_array_memory_size(); + let ticket = self.memory_pool.request_memory(bytes_required).await; + let tracked_event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket)); + + self.broadcast_event(tracked_event).await + } + + /// Route a batch to one downstream by hash partitioning (shuffle). + pub async fn collect_keyed(&self, key_hash: u64, batch: RecordBatch) -> Result<()> { + let num_downstreams = self.downstream_senders.len(); + if num_downstreams == 0 { + return Ok(()); + } + + let bytes_required = batch.get_array_memory_size(); + let ticket = self.memory_pool.request_memory(bytes_required).await; + let event = TrackedEvent::new(StreamEvent::Data(batch), Some(ticket)); + + let target_idx = (key_hash as usize) % num_downstreams; + + self.downstream_senders[target_idx] + .send(event) + .await + .with_context(|| { + format!( + "{} failed to route keyed data to downstream index {}", + self.task_name, target_idx + ) + })?; + + Ok(()) + } + + /// Broadcast a control event (watermark, barrier, end-of-stream). 
+ pub async fn broadcast(&self, event: StreamEvent) -> Result<()> { + if self.downstream_senders.is_empty() { + return Ok(()); + } + let tracked_event = TrackedEvent::control(event); + self.broadcast_event(tracked_event).await + } + + // ------------------------------------------------------------------------- + // Internal dispatch + // ------------------------------------------------------------------------- + + async fn broadcast_event(&self, event: TrackedEvent) -> Result<()> { + let mut iter = self.downstream_senders.iter().enumerate().peekable(); + + while let Some((idx, sender)) = iter.next() { + if iter.peek().is_some() { + sender.send(event.clone()).await.with_context(|| { + format!( + "{} failed to broadcast event to downstream index {}", + self.task_name, idx + ) + })?; + } else { + sender.send(event).await.with_context(|| { + format!( + "{} failed to send final event to downstream index {}", + self.task_name, idx + ) + })?; + break; + } + } + + Ok(()) + } +} diff --git a/src/runtime/streaming/api/mod.rs b/src/runtime/streaming/api/mod.rs new file mode 100644 index 00000000..f858022d --- /dev/null +++ b/src/runtime/streaming/api/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod context; +pub mod operator; +pub mod source; diff --git a/src/runtime/streaming/api/operator.rs b/src/runtime/streaming/api/operator.rs new file mode 100644 index 00000000..df8f0dcb --- /dev/null +++ b/src/runtime/streaming/api/operator.rs @@ -0,0 +1,80 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::source::SourceOperator; +use crate::runtime::streaming::protocol::event::StreamOutput; +use crate::sql::common::{CheckpointBarrier, Watermark}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use std::time::Duration; + +// --------------------------------------------------------------------------- +// ConstructedOperator +// --------------------------------------------------------------------------- + +pub enum ConstructedOperator { + Source(Box), + Operator(Box), +} + +#[async_trait] +pub trait Operator: Send + 'static { + fn name(&self) -> &str; + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> anyhow::Result>; + + async fn process_watermark( + &mut self, + watermark: Watermark, + ctx: &mut TaskContext, + ) -> anyhow::Result>; + + async fn snapshot_state( + &mut self, + barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> anyhow::Result<()>; + + async fn commit_checkpoint( + 
&mut self, + _epoch: u32, + _ctx: &mut TaskContext, + ) -> anyhow::Result<()> { + Ok(()) + } + + fn tick_interval(&self) -> Option { + None + } + + async fn process_tick( + &mut self, + _tick_index: u64, + _ctx: &mut TaskContext, + ) -> anyhow::Result> { + Ok(vec![]) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result> { + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/api/source.rs b/src/runtime/streaming/api/source.rs new file mode 100644 index 00000000..81435b47 --- /dev/null +++ b/src/runtime/streaming/api/source.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::sql::common::{CheckpointBarrier, Watermark};
+use arrow_array::RecordBatch;
+use async_trait::async_trait;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum SourceOffset {
+    Earliest,
+    Latest,
+    #[default]
+    Group,
+}
+
+#[derive(Debug)]
+pub enum SourceEvent {
+    Data(RecordBatch),
+    Watermark(Watermark),
+    Idle,
+    EndOfStream,
+}
+
+#[async_trait]
+pub trait SourceOperator: Send + 'static {
+    fn name(&self) -> &str;
+
+    async fn on_start(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    async fn fetch_next(&mut self, ctx: &mut TaskContext) -> anyhow::Result<SourceEvent>;
+
+    fn poll_watermark(&mut self) -> Option<Watermark> {
+        None
+    }
+
+    async fn snapshot_state(
+        &mut self,
+        barrier: CheckpointBarrier,
+        ctx: &mut TaskContext,
+    ) -> anyhow::Result<()>;
+
+    async fn commit_checkpoint(
+        &mut self,
+        _epoch: u32,
+        _ctx: &mut TaskContext,
+    ) -> anyhow::Result<()> {
+        Ok(())
+    }
+
+    async fn on_close(&mut self, _ctx: &mut TaskContext) -> anyhow::Result<()> {
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/error.rs b/src/runtime/streaming/error.rs
new file mode 100644
index 00000000..ff850e82
--- /dev/null
+++ b/src/runtime/streaming/error.rs
@@ -0,0 +1,46 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt::Display;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum RunError {
+    #[error("Operator execution failed: {0:#}")]
+    Operator(#[from] anyhow::Error),
+
+    #[error("Downstream send failed: {0}")]
+    DownstreamSend(String),
+
+    #[error("Internal engine error: {0}")]
+    Internal(String),
+
+    #[error("State backend error: {0}")]
+    State(String),
+
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+impl RunError {
+    pub fn internal<T: Display>(msg: T) -> Self {
+        Self::Internal(msg.to_string())
+    }
+
+    pub fn downstream<T: Display>(msg: T) -> Self {
+        Self::DownstreamSend(msg.to_string())
+    }
+
+    pub fn state<T: Display>(msg: T) -> Self {
+        Self::State(msg.to_string())
+    }
+}
diff --git a/src/runtime/streaming/execution/mod.rs b/src/runtime/streaming/execution/mod.rs
new file mode 100644
index 00000000..57765bea
--- /dev/null
+++ b/src/runtime/streaming/execution/mod.rs
@@ -0,0 +1,20 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod operator_chain;
+pub mod pipeline;
+pub mod source_driver;
+pub mod tracker;
+
+pub use operator_chain::{ChainBuilder, OperatorDrive};
+pub use pipeline::Pipeline;
+pub use source_driver::SourceDriver;
diff --git a/src/runtime/streaming/execution/operator_chain.rs b/src/runtime/streaming/execution/operator_chain.rs
new file mode 100644
index 00000000..a2e6c5c6
--- /dev/null
+++ b/src/runtime/streaming/execution/operator_chain.rs
@@ -0,0 +1,290 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use async_trait::async_trait;
+
+use crate::runtime::streaming::api::context::TaskContext;
+use crate::runtime::streaming::api::operator::Operator;
+use crate::runtime::streaming::error::RunError;
+use crate::runtime::streaming::protocol::{
+    control::{ControlCommand, StopMode},
+    event::{StreamEvent, StreamOutput, TrackedEvent},
+};
+use crate::sql::common::CheckpointBarrier;
+
+#[async_trait]
+pub trait OperatorDrive: Send {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        event: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError>;
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError>;
+}
+
+pub struct ChainBuilder;
+
+impl ChainBuilder {
+    pub fn build(mut operators: Vec<Box<dyn Operator>>) -> Option<Box<dyn OperatorDrive>> {
+        let tail_operator = operators.pop()?;
+
+        let mut current_driver: Box<dyn OperatorDrive> = Box::new(TailDriver::new(tail_operator));
+
+        while let Some(op) = operators.pop() {
+            current_driver = Box::new(IntermediateDriver::new(op, current_driver));
+        }
+
+        Some(current_driver)
+    }
+}
+
+pub struct IntermediateDriver {
+    operator: Box<dyn Operator>,
+    next: Box<dyn OperatorDrive>,
+}
+
+impl IntermediateDriver {
+    pub fn new(operator: Box<dyn Operator>, next: Box<dyn OperatorDrive>) -> Self {
+        Self { operator, next }
+    }
+
+    async fn dispatch_outputs(
+        &mut self,
+        outputs: Vec<StreamOutput>,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        for out in outputs {
+            match out {
+                StreamOutput::Forward(b) => {
+                    self.next
+                        .process_event(0, TrackedEvent::control(StreamEvent::Data(b)), ctx)
+                        .await?;
+                }
+                StreamOutput::Watermark(wm) => {
+                    self.next
+                        .process_event(0, TrackedEvent::control(StreamEvent::Watermark(wm)), ctx)
+                        .await?;
+                }
+                StreamOutput::Keyed(_, _) | StreamOutput::Broadcast(_) => {
+                    return Err(RunError::internal(format!(
+                        "Topology Violation: Keyed or Broadcast output emitted in the middle of chain by '{}'",
+                        self.operator.name()
+                    )));
+                }
+            }
+        }
+        Ok(())
+    }
+
+    async fn forward_signal(
+        &mut self,
+        event: StreamEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        self.next
+            .process_event(0, TrackedEvent::control(event), ctx)
+            .await
+            .map(|_| ())
+    }
+}
+
+#[async_trait]
+impl OperatorDrive for IntermediateDriver {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        self.operator.on_start(ctx).await?;
+        self.next.on_start(ctx).await?;
+        Ok(())
+    }
+
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        tracked: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        match tracked.event {
+            StreamEvent::Data(batch) => {
+                let outputs = self.operator.process_data(input_idx, batch, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                Ok(false)
+            }
+            StreamEvent::Watermark(wm) => {
+                let outputs = self.operator.process_watermark(wm, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                self.forward_signal(StreamEvent::Watermark(wm), ctx).await?;
+                Ok(false)
+            }
+            StreamEvent::Barrier(barrier) => {
+                self.operator.snapshot_state(barrier, ctx).await?;
+                self.forward_signal(StreamEvent::Barrier(barrier), ctx)
+                    .await?;
+                Ok(false)
+            }
+            StreamEvent::EndOfStream => {
+                self.forward_signal(StreamEvent::EndOfStream, ctx).await?;
+                Ok(true)
+            }
+        }
+    }
+
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut stop = false;
+
+        match &cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.clone().into();
+                self.operator.snapshot_state(b, ctx).await?;
+            }
+            ControlCommand::Commit { epoch } => {
+                self.operator.commit_checkpoint(*epoch, ctx).await?;
+            }
+            ControlCommand::Stop { mode } if *mode == StopMode::Immediate => {
+                stop = true;
+            }
+            _ => {}
+        }
+
+        if self.next.handle_control(cmd, ctx).await? {
+            stop = true;
+        }
+
+        Ok(stop)
+    }
+
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        let close_outs = self.operator.on_close(ctx).await?;
+        self.dispatch_outputs(close_outs, ctx).await?;
+        self.next.on_close(ctx).await?;
+        Ok(())
+    }
+}
+
+pub struct TailDriver {
+    operator: Box<dyn Operator>,
+}
+
+impl TailDriver {
+    pub fn new(operator: Box<dyn Operator>) -> Self {
+        Self { operator }
+    }
+
+    async fn dispatch_outputs(
+        &mut self,
+        outputs: Vec<StreamOutput>,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        for out in outputs {
+            match out {
+                StreamOutput::Forward(b) => ctx.collect(b).await?,
+                StreamOutput::Keyed(hash, b) => ctx.collect_keyed(hash, b).await?,
+                StreamOutput::Broadcast(b) => ctx.collect(b).await?,
+                StreamOutput::Watermark(wm) => ctx.broadcast(StreamEvent::Watermark(wm)).await?,
+            }
+        }
+        Ok(())
+    }
+
+    async fn forward_signal(
+        &mut self,
+        event: StreamEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<(), RunError> {
+        match event {
+            StreamEvent::Watermark(wm) => ctx.broadcast(StreamEvent::Watermark(wm)).await?,
+            StreamEvent::Barrier(b) => ctx.broadcast(StreamEvent::Barrier(b)).await?,
+            StreamEvent::EndOfStream => ctx.broadcast(StreamEvent::EndOfStream).await?,
+            StreamEvent::Data(_) => unreachable!("Data signal should not be forwarded implicitly"),
+        }
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl OperatorDrive for TailDriver {
+    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        self.operator.on_start(ctx).await?;
+        Ok(())
+    }
+
+    async fn process_event(
+        &mut self,
+        input_idx: usize,
+        tracked: TrackedEvent,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        match tracked.event {
+            StreamEvent::Data(batch) => {
+                let outputs = self.operator.process_data(input_idx, batch, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                Ok(false)
+            }
+            StreamEvent::Watermark(wm) => {
+                let outputs = self.operator.process_watermark(wm, ctx).await?;
+                self.dispatch_outputs(outputs, ctx).await?;
+                self.forward_signal(StreamEvent::Watermark(wm), ctx).await?;
+                Ok(false)
+            }
+            StreamEvent::Barrier(barrier) => {
+                self.operator.snapshot_state(barrier, ctx).await?;
+                self.forward_signal(StreamEvent::Barrier(barrier), ctx)
+                    .await?;
+                Ok(false)
+            }
+            StreamEvent::EndOfStream => {
+                self.forward_signal(StreamEvent::EndOfStream, ctx).await?;
+                Ok(true)
+            }
+        }
+    }
+
+    async fn handle_control(
+        &mut self,
+        cmd: ControlCommand,
+        ctx: &mut TaskContext,
+    ) -> Result<bool, RunError> {
+        let mut stop = false;
+
+        match &cmd {
+            ControlCommand::TriggerCheckpoint { barrier } => {
+                let b: CheckpointBarrier = barrier.clone().into();
+                self.operator.snapshot_state(b, ctx).await?;
+                ctx.broadcast(StreamEvent::Barrier(b)).await?;
+            }
+            ControlCommand::Commit { epoch } => {
+                self.operator.commit_checkpoint(*epoch, ctx).await?;
+            }
+            ControlCommand::Stop { mode } if *mode == StopMode::Immediate => {
+                stop = true;
+            }
+            _ => {}
+        }
+
+        Ok(stop)
+    }
+
+    async fn on_close(&mut self, ctx: &mut TaskContext) -> Result<(), RunError> {
+        let close_outs = self.operator.on_close(ctx).await?;
+        self.dispatch_outputs(close_outs, ctx).await?;
+        Ok(())
+    }
+}
diff --git a/src/runtime/streaming/execution/pipeline.rs b/src/runtime/streaming/execution/pipeline.rs
new file mode 100644
index 00000000..d6ef06a3
--- /dev/null
+++ b/src/runtime/streaming/execution/pipeline.rs
@@ -0,0 +1,175 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +use tokio::sync::mpsc::Receiver; +use tokio_stream::{StreamExt, StreamMap}; +use tracing::{Instrument, info, info_span}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::error::RunError; +use crate::runtime::streaming::execution::operator_chain::{ChainBuilder, OperatorDrive}; +use crate::runtime::streaming::execution::tracker::{ + barrier_aligner::{AlignmentStatus, BarrierAligner}, + watermark_tracker::WatermarkTracker, +}; +use crate::runtime::streaming::network::endpoint::BoxedEventStream; +use crate::runtime::streaming::protocol::{ + control::ControlCommand, + event::{StreamEvent, TrackedEvent}, +}; +use crate::sql::common::Watermark; + +pub struct Pipeline { + chain_head: Box, + ctx: TaskContext, + inboxes: Vec, + control_rx: Receiver, + + wm_tracker: WatermarkTracker, + barrier_aligner: BarrierAligner, + paused_streams: Vec>, +} + +impl Pipeline { + pub fn new( + operators: Vec>, + ctx: TaskContext, + inboxes: Vec, + control_rx: Receiver, + ) -> Result { + let input_count = inboxes.len(); + let chain_head = ChainBuilder::build(operators) + .ok_or_else(|| RunError::internal("Cannot build pipeline with empty operators"))?; + + let paused_streams = (0..input_count).map(|_| None).collect(); + + Ok(Self { + chain_head, + ctx, + inboxes, + control_rx, + wm_tracker: WatermarkTracker::new(input_count), + barrier_aligner: BarrierAligner::new(input_count), + paused_streams, + }) + } + + pub async fn run(mut self) -> Result<(), RunError> { + let span = info_span!( + "pipeline_run", + job_id = %self.ctx.job_id, + pipeline_id = self.ctx.pipeline_id + ); + + async move { + info!("Pipeline initializing..."); + self.chain_head.on_start(&mut self.ctx).await?; + + let mut active_streams = StreamMap::new(); + for (i, stream) in std::mem::take(&mut self.inboxes).into_iter().enumerate() { + active_streams.insert(i, stream); + } + + loop { + tokio::select! 
{ + biased; + + Some(cmd) = self.control_rx.recv() => { + if self.chain_head.handle_control(cmd, &mut self.ctx).await? { + break; + } + } + + Some((idx, tracked_event)) = active_streams.next() => { + match tracked_event.event { + StreamEvent::Data(batch) => { + self.chain_head + .process_event( + idx, + TrackedEvent::control(StreamEvent::Data(batch)), + &mut self.ctx, + ) + .await?; + } + + StreamEvent::Barrier(barrier) => { + match self.barrier_aligner.mark(idx, &barrier) { + AlignmentStatus::Pending => { + if let Some(stream) = active_streams.remove(&idx) { + self.paused_streams[idx] = Some(stream); + } + } + AlignmentStatus::Complete => { + self.chain_head + .process_event( + idx, + TrackedEvent::control(StreamEvent::Barrier(barrier)), + &mut self.ctx, + ) + .await?; + + for i in 0..self.paused_streams.len() { + if let Some(stream) = self.paused_streams[i].take() { + active_streams.insert(i, stream); + } + } + } + } + } + + StreamEvent::Watermark(wm) => { + if let Some(aligned_wm) = self.wm_tracker.update(idx, wm) { + if let Watermark::EventTime(t) = aligned_wm { + self.ctx.advance_watermark(t); + } + self.chain_head + .process_event( + idx, + TrackedEvent::control(StreamEvent::Watermark(aligned_wm)), + &mut self.ctx, + ) + .await?; + } + } + + StreamEvent::EndOfStream => { + if self.wm_tracker.increment_eof() == self.wm_tracker.input_count() { + self.chain_head + .process_event( + idx, + TrackedEvent::control(StreamEvent::EndOfStream), + &mut self.ctx, + ) + .await?; + break; + } + } + } + } + + else => break, + } + } + + self.teardown().await + } + .instrument(span) + .await + } + + async fn teardown(mut self) -> Result<(), RunError> { + info!("Pipeline tearing down..."); + self.chain_head.on_close(&mut self.ctx).await?; + Ok(()) + } +} diff --git a/src/runtime/streaming/execution/source_driver.rs b/src/runtime/streaming/execution/source_driver.rs new file mode 100644 index 00000000..6813a82a --- /dev/null +++ b/src/runtime/streaming/execution/source_driver.rs 
@@ -0,0 +1,193 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use tokio::sync::mpsc::Receiver; +use tokio::time::{Instant, sleep}; +use tracing::{Instrument, info, info_span, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::source::{SourceEvent, SourceOperator}; +use crate::runtime::streaming::error::RunError; +use crate::runtime::streaming::execution::OperatorDrive; +use crate::runtime::streaming::protocol::{ + control::ControlCommand, + event::{StreamEvent, TrackedEvent}, +}; +use crate::sql::common::CheckpointBarrier; + +pub struct SourceDriver { + operator: Box, + chain_head: Option>, + ctx: TaskContext, + control_rx: Receiver, +} + +impl SourceDriver { + pub fn new( + operator: Box, + chain_head: Option>, + ctx: TaskContext, + control_rx: Receiver, + ) -> Self { + Self { + operator, + chain_head, + ctx, + control_rx, + } + } + + pub async fn run(mut self) -> Result<(), RunError> { + let span = info_span!( + "source_run", + job_id = %self.ctx.job_id, + pipeline_id = self.ctx.pipeline_id, + op = self.operator.name() + ); + + async move { + info!("SourceDriver initializing..."); + + self.operator.on_start(&mut self.ctx).await?; + if let Some(chain) = &mut self.chain_head { + chain.on_start(&mut self.ctx).await?; + } + + let idle_timeout = self.ctx.config().source_idle_timeout; + + let idle_delay = sleep(idle_timeout); + tokio::pin!(idle_delay); + + let mut is_idle = false; + + loop { + 
tokio::select! { + biased; + + Some(cmd) = self.control_rx.recv() => { + if self.handle_control(cmd).await? { + info!("SourceDriver received stop signal, breaking event loop."); + break; + } + } + + () = idle_delay.as_mut(), if is_idle => { + is_idle = false; + } + + fetch_res = self.operator.fetch_next(&mut self.ctx), if !is_idle => { + match fetch_res { + Ok(SourceEvent::Data(batch)) => { + self.dispatch_event(StreamEvent::Data(batch)).await?; + } + Ok(SourceEvent::Watermark(wm)) => { + self.dispatch_event(StreamEvent::Watermark(wm)).await?; + } + Ok(SourceEvent::Idle) => { + is_idle = true; + idle_delay + .as_mut() + .reset(Instant::now() + idle_timeout); + } + Ok(SourceEvent::EndOfStream) => { + self.dispatch_event(StreamEvent::EndOfStream).await?; + info!( + "Source '{}' reached EndOfStream, pipeline shutting down gracefully.", + self.operator.name() + ); + break; + } + Err(e) => { + warn!( + "Source operator '{}' encountered critical fetch error: {}", + self.operator.name(), + e + ); + return Err(RunError::Operator(e)); + } + } + } + + else => { + warn!("Control channel closed unexpectedly, SourceDriver shutting down."); + break; + } + } + } + + self.teardown().await + } + .instrument(span) + .await + } + + async fn dispatch_event(&mut self, event: StreamEvent) -> Result<(), RunError> { + if let Some(chain) = &mut self.chain_head { + chain + .process_event(0, TrackedEvent::control(event), &mut self.ctx) + .await?; + } else { + match event { + StreamEvent::Data(b) => self.ctx.collect(b).await?, + StreamEvent::Watermark(w) => { + self.ctx.broadcast(StreamEvent::Watermark(w)).await?; + } + StreamEvent::Barrier(b) => { + self.ctx.broadcast(StreamEvent::Barrier(b)).await?; + } + StreamEvent::EndOfStream => { + self.ctx.broadcast(StreamEvent::EndOfStream).await?; + } + } + } + Ok(()) + } + + async fn handle_control(&mut self, cmd: ControlCommand) -> Result { + let mut stop = false; + + match &cmd { + ControlCommand::TriggerCheckpoint { barrier } => { + let b: 
CheckpointBarrier = barrier.clone().into(); + self.operator.snapshot_state(b, &mut self.ctx).await?; + self.dispatch_event(StreamEvent::Barrier(b)).await?; + } + ControlCommand::Commit { epoch } => { + self.operator + .commit_checkpoint(*epoch, &mut self.ctx) + .await?; + } + ControlCommand::Stop { .. } => { + stop = true; + } + _ => {} + } + + if let Some(chain) = &mut self.chain_head + && chain.handle_control(cmd, &mut self.ctx).await? + { + stop = true; + } + + Ok(stop) + } + + async fn teardown(mut self) -> Result<(), RunError> { + info!("SourceDriver teardown initiated..."); + self.operator.on_close(&mut self.ctx).await?; + if let Some(chain) = &mut self.chain_head { + chain.on_close(&mut self.ctx).await?; + } + info!("SourceDriver teardown complete. Goodbye."); + Ok(()) + } +} diff --git a/src/runtime/streaming/execution/tracker/barrier_aligner.rs b/src/runtime/streaming/execution/tracker/barrier_aligner.rs new file mode 100644 index 00000000..4f954a7d --- /dev/null +++ b/src/runtime/streaming/execution/tracker/barrier_aligner.rs @@ -0,0 +1,55 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashSet; + +use crate::sql::common::CheckpointBarrier; + +#[derive(Debug)] +pub enum AlignmentStatus { + Pending, + Complete, +} + +#[derive(Debug)] +pub struct BarrierAligner { + input_count: usize, + current_epoch: Option, + reached_inputs: HashSet, +} + +impl BarrierAligner { + pub fn new(input_count: usize) -> Self { + Self { + input_count, + current_epoch: None, + reached_inputs: HashSet::new(), + } + } + + pub fn mark(&mut self, input_idx: usize, barrier: &CheckpointBarrier) -> AlignmentStatus { + if self.current_epoch != Some(barrier.epoch) { + self.current_epoch = Some(barrier.epoch); + self.reached_inputs.clear(); + } + + self.reached_inputs.insert(input_idx); + + if self.reached_inputs.len() == self.input_count { + self.current_epoch = None; + self.reached_inputs.clear(); + AlignmentStatus::Complete + } else { + AlignmentStatus::Pending + } + } +} diff --git a/src/runtime/streaming/execution/tracker/mod.rs b/src/runtime/streaming/execution/tracker/mod.rs new file mode 100644 index 00000000..edacf5b2 --- /dev/null +++ b/src/runtime/streaming/execution/tracker/mod.rs @@ -0,0 +1,14 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod barrier_aligner; +pub mod watermark_tracker; diff --git a/src/runtime/streaming/execution/tracker/watermark_tracker.rs b/src/runtime/streaming/execution/tracker/watermark_tracker.rs new file mode 100644 index 00000000..af6fd0bc --- /dev/null +++ b/src/runtime/streaming/execution/tracker/watermark_tracker.rs @@ -0,0 +1,112 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::event::{merge_watermarks, watermark_strictly_advances}; +use crate::sql::common::Watermark; + +#[derive(Debug)] +pub struct WatermarkTracker { + watermarks: Vec>, + current_min_watermark: Option, + eof_count: usize, +} + +impl WatermarkTracker { + pub fn new(input_count: usize) -> Self { + Self { + watermarks: vec![None; input_count], + current_min_watermark: None, + eof_count: 0, + } + } + + pub fn update(&mut self, input_idx: usize, wm: Watermark) -> Option { + self.watermarks[input_idx] = Some(wm); + + if self.watermarks.iter().any(|w| w.is_none()) { + return None; + } + + let new_min = merge_watermarks(&self.watermarks)?; + + if !watermark_strictly_advances(new_min, self.current_min_watermark) { + return None; + } + + self.current_min_watermark = Some(new_min); + Some(new_min) + } + + pub fn increment_eof(&mut self) -> usize { + self.eof_count += 1; + self.eof_count + } + + pub fn input_count(&self) -> usize { + self.watermarks.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::{Duration, SystemTime}; + + 
#[test] + fn no_emit_until_all_inputs_seen() { + let mut t = WatermarkTracker::new(2); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(3)); + assert!(t.update(0, w).is_none()); + let w2 = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(1, w2), Some(w2)); + } + + #[test] + fn dedup_same_aligned() { + let mut t = WatermarkTracker::new(1); + let w = Watermark::EventTime(SystemTime::UNIX_EPOCH + Duration::from_secs(1)); + assert_eq!(t.update(0, w), Some(w)); + assert!(t.update(0, w).is_none()); + } + + #[test] + fn advances_only_when_min_strictly_increases() { + let mut t = WatermarkTracker::new(2); + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(1); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + assert!(t.update(0, Watermark::EventTime(t5)).is_none()); + assert_eq!( + t.update(1, Watermark::EventTime(t1)), + Some(Watermark::EventTime(t1)) + ); + let t3 = SystemTime::UNIX_EPOCH + Duration::from_secs(3); + assert_eq!( + t.update(1, Watermark::EventTime(t3)), + Some(Watermark::EventTime(t3)) + ); + assert!(t.update(1, Watermark::EventTime(t3)).is_none()); + } + + #[test] + fn backward_aligned_min_is_ignored() { + let mut t = WatermarkTracker::new(2); + let t5 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let t10 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + assert!(t.update(0, Watermark::EventTime(t10)).is_none()); + assert_eq!( + t.update(1, Watermark::EventTime(t5)), + Some(Watermark::EventTime(t5)) + ); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(2); + assert!(t.update(0, Watermark::EventTime(t2)).is_none()); + } +} diff --git a/src/runtime/streaming/factory/connector/dispatchers.rs b/src/runtime/streaming/factory/connector/dispatchers.rs new file mode 100644 index 00000000..430a49f9 --- /dev/null +++ b/src/runtime/streaming/factory/connector/dispatchers.rs @@ -0,0 +1,37 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// 
you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use anyhow::Result; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; + +use super::kafka::KafkaConnectorDispatcher; + +pub struct ConnectorSourceDispatcher; + +impl OperatorConstructor for ConnectorSourceDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + KafkaConnectorDispatcher.with_config(config, registry) + } +} + +pub struct ConnectorSinkDispatcher; + +impl OperatorConstructor for ConnectorSinkDispatcher { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + KafkaConnectorDispatcher.with_config(config, registry) + } +} diff --git a/src/runtime/streaming/factory/connector/kafka.rs b/src/runtime/streaming/factory/connector/kafka.rs new file mode 100644 index 00000000..75135197 --- /dev/null +++ b/src/runtime/streaming/factory/connector/kafka.rs @@ -0,0 +1,252 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Context, Result, bail}; +use prost::Message; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::sync::Arc; + +use protocol::function_stream_graph::connector_op::Config; +use protocol::function_stream_graph::{ + BadDataPolicy, ConnectorOp, DecimalEncodingProto, FormatConfig, KafkaAuthConfig, + KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig, KafkaSourceConfig, + TimestampFormatProto, +}; +use tracing::info; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::api::source::SourceOffset; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::factory::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::format::{ + BadDataPolicy as RtBadDataPolicy, DataSerializer, DecimalEncoding as RtDecimalEncoding, + Format as RuntimeFormat, JsonFormat as RuntimeJsonFormat, TimestampFormat as RtTimestampFormat, +}; +use crate::runtime::streaming::operators::sink::kafka::{ConsistencyMode, KafkaSinkOperator}; +use crate::runtime::streaming::operators::source::kafka::{ + BufferedDeserializer, KafkaSourceOperator, +}; +use crate::sql::common::FsSchema; + +const DEFAULT_SOURCE_BATCH_SIZE: usize = 1024; + +// ─────────────── Proto → Runtime type conversions ─────────────── + +fn proto_format_to_runtime(fmt: &Option) -> Result { + let cfg = fmt.as_ref().context("FormatConfig is required")?; + match &cfg.format { + Some(protocol::function_stream_graph::format_config::Format::Json(j)) => { + Ok(RuntimeFormat::Json(RuntimeJsonFormat { + timestamp_format: match j.timestamp_format() { + TimestampFormatProto::TimestampRfc3339 => RtTimestampFormat::RFC3339, + TimestampFormatProto::TimestampUnixMillis => RtTimestampFormat::UnixMillis, + }, + decimal_encoding: match j.decimal_encoding() { + DecimalEncodingProto::DecimalNumber => 
RtDecimalEncoding::Number, + DecimalEncodingProto::DecimalString => RtDecimalEncoding::String, + DecimalEncodingProto::DecimalBytes => RtDecimalEncoding::Bytes, + }, + include_schema: j.include_schema, + })) + } + Some(protocol::function_stream_graph::format_config::Format::RawString(_)) => { + Ok(RuntimeFormat::RawString) + } + Some(protocol::function_stream_graph::format_config::Format::RawBytes(_)) => { + Ok(RuntimeFormat::RawBytes) + } + None => bail!("FormatConfig has no format variant set"), + } +} + +fn proto_bad_data_to_runtime(policy: i32) -> RtBadDataPolicy { + match BadDataPolicy::try_from(policy) { + Ok(BadDataPolicy::BadDataDrop) => RtBadDataPolicy::Drop, + _ => RtBadDataPolicy::Fail, + } +} + +fn proto_offset_to_runtime(mode: i32) -> SourceOffset { + match KafkaOffsetMode::try_from(mode) { + Ok(KafkaOffsetMode::KafkaOffsetLatest) => SourceOffset::Latest, + Ok(KafkaOffsetMode::KafkaOffsetEarliest) => SourceOffset::Earliest, + _ => SourceOffset::Group, + } +} + +fn build_auth_client_configs(auth: &Option) -> HashMap { + let mut out = HashMap::new(); + let Some(auth) = auth else { return out }; + match &auth.auth { + Some(protocol::function_stream_graph::kafka_auth_config::Auth::Sasl(sasl)) => { + out.insert("security.protocol".to_string(), sasl.protocol.clone()); + out.insert("sasl.mechanism".to_string(), sasl.mechanism.clone()); + out.insert("sasl.username".to_string(), sasl.username.clone()); + out.insert("sasl.password".to_string(), sasl.password.clone()); + } + Some(protocol::function_stream_graph::kafka_auth_config::Auth::AwsMskIam(iam)) => { + out.insert("security.protocol".to_string(), "SASL_SSL".to_string()); + out.insert("sasl.mechanism".to_string(), "OAUTHBEARER".to_string()); + out.insert( + "sasl.oauthbearer.extensions".to_string(), + format!("logicalCluster=aws_msk;aws_region={}", iam.region), + ); + } + _ => {} + } + out +} + +fn merge_client_configs( + auth: &Option, + extra: &HashMap, +) -> HashMap { + let mut configs = 
build_auth_client_configs(auth); + for (k, v) in extra { + configs.insert(k.clone(), v.clone()); + } + configs +} + +pub struct KafkaConnectorDispatcher; + +impl OperatorConstructor for KafkaConnectorDispatcher { + fn with_config(&self, payload: &[u8], _registry: Arc) -> Result { + let op = ConnectorOp::decode(payload).context("Failed to decode ConnectorOp protobuf")?; + + let fs_schema = op + .fs_schema + .as_ref() + .map(|fs| FsSchema::try_from(fs.clone())) + .transpose() + .map_err(|e| anyhow::anyhow!("{e}"))?; + + match op.config { + Some(Config::KafkaSource(ref cfg)) => { + Self::build_kafka_source(&op.name, cfg, fs_schema) + } + Some(Config::KafkaSink(ref cfg)) => Self::build_kafka_sink(&op.name, cfg, fs_schema), + Some(Config::Generic(_)) => bail!( + "ConnectorOp '{}': GenericConnectorConfig dispatch not yet implemented", + op.name + ), + None => bail!("ConnectorOp '{}' has no configuration payload", op.name), + } + } +} + +impl KafkaConnectorDispatcher { + fn build_kafka_source( + _name: &str, + cfg: &KafkaSourceConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Source"); + + let fs = fs_schema.context("fs_schema is required for Kafka Source")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let mut final_configs = client_configs; + if cfg.read_mode() == KafkaReadMode::KafkaReadCommitted { + final_configs.insert("isolation.level".to_string(), "read_committed".to_string()); + } + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let bad_data = proto_bad_data_to_runtime(cfg.bad_data_policy); + + let deserializer = Box::new(BufferedDeserializer::new( + runtime_format, + fs.schema.clone(), + bad_data, + DEFAULT_SOURCE_BATCH_SIZE, + )); + + // 0 (unset) means "no explicit limit": fall back to the 1M msgs/sec ceiling + let rate = NonZeroU32::new(cfg.rate_limit_msgs_per_sec) + .unwrap_or_else(|| NonZeroU32::new(1_000_000).expect("nonzero")); + + let source_op = KafkaSourceOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + 
cfg.group_id.clone(), + cfg.group_id_prefix.clone(), + proto_offset_to_runtime(cfg.offset_mode), + final_configs, + rate, + vec![], + deserializer, + ); + + Ok(ConstructedOperator::Source(Box::new(source_op))) + } + + fn build_kafka_sink( + _name: &str, + cfg: &KafkaSinkConfig, + fs_schema: Option, + ) -> Result { + info!(topic = %cfg.topic, "Constructing Kafka Sink"); + + let fs_in = fs_schema.context("fs_schema is required for Kafka Sink")?; + let client_configs = merge_client_configs(&cfg.auth, &cfg.client_configs); + + let consistency = match cfg.commit_mode() { + KafkaSinkCommitMode::KafkaSinkExactlyOnce => ConsistencyMode::ExactlyOnce, + KafkaSinkCommitMode::KafkaSinkAtLeastOnce => ConsistencyMode::AtLeastOnce, + }; + + let runtime_format = proto_format_to_runtime(&cfg.format)?; + let fs = sink_fs_schema_adjusted(fs_in, &cfg.key_field, &cfg.timestamp_field)?; + let serializer = DataSerializer::new(runtime_format, fs.schema.clone()); + + let sink_op = KafkaSinkOperator::new( + cfg.topic.clone(), + cfg.bootstrap_servers.clone(), + consistency, + client_configs, + fs, + serializer, + ); + + Ok(ConstructedOperator::Operator(Box::new(sink_op))) + } +} + +fn sink_fs_schema_adjusted( + fs: FsSchema, + key_field: &Option, + timestamp_field: &Option, +) -> Result { + if key_field.is_none() && timestamp_field.is_none() { + return Ok(fs); + } + let schema = fs.schema.clone(); + let ts = if let Some(name) = timestamp_field { + schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("timestamp column '{name}' not found in schema"))? + .0 + } else { + fs.timestamp_index + }; + let keys = fs.clone_storage_key_indices(); + let routing = if let Some(name) = key_field { + let k = schema + .column_with_name(name) + .ok_or_else(|| anyhow::anyhow!("key column '{name}' not found in schema"))? 
+ .0; + Some(vec![k]) + } else { + fs.clone_routing_key_indices() + }; + Ok(FsSchema::new(schema, ts, keys, routing)) +} diff --git a/src/runtime/streaming/factory/connector/mod.rs b/src/runtime/streaming/factory/connector/mod.rs new file mode 100644 index 00000000..381de89c --- /dev/null +++ b/src/runtime/streaming/factory/connector/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod dispatchers; +pub mod kafka; + +pub use dispatchers::{ConnectorSinkDispatcher, ConnectorSourceDispatcher}; +pub use kafka::KafkaConnectorDispatcher; diff --git a/src/runtime/source/mod.rs b/src/runtime/streaming/factory/global/mod.rs similarity index 90% rename from src/runtime/source/mod.rs rename to src/runtime/streaming/factory/global/mod.rs index 8a05bf30..a8630aba 100644 --- a/src/runtime/source/mod.rs +++ b/src/runtime/streaming/factory/global/mod.rs @@ -10,6 +10,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// Source module +mod session_registry; -// TODO: Add source implementation here +pub use session_registry::Registry; diff --git a/src/runtime/streaming/factory/global/session_registry.rs b/src/runtime/streaming/factory/global/session_registry.rs new file mode 100644 index 00000000..e65988c5 --- /dev/null +++ b/src/runtime/streaming/factory/global/session_registry.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::Result as DfResult; +use datafusion::execution::FunctionRegistry; +use datafusion::execution::context::SessionContext; +use datafusion::logical_expr::planner::ExprPlanner; +use datafusion::logical_expr::{AggregateUDF, ScalarUDF, WindowUDF}; + +/// Global session registry used by DataFusion [`FunctionRegistry`] integration. 
+pub struct Registry { + ctx: SessionContext, +} + +impl Default for Registry { + fn default() -> Self { + Self::new() + } +} + +impl Registry { + pub fn new() -> Self { + Self { + ctx: SessionContext::new(), + } + } + + pub fn session_context(&self) -> &SessionContext { + &self.ctx + } +} + +impl FunctionRegistry for Registry { + fn udfs(&self) -> HashSet { + self.ctx.udfs() + } + + fn udf(&self, name: &str) -> DfResult> { + self.ctx.udf(name) + } + + fn udaf(&self, name: &str) -> DfResult> { + self.ctx.udaf(name) + } + + fn udwf(&self, name: &str) -> DfResult> { + self.ctx.udwf(name) + } + + fn expr_planners(&self) -> Vec> { + self.ctx.expr_planners() + } +} diff --git a/src/runtime/streaming/factory/mod.rs b/src/runtime/streaming/factory/mod.rs new file mode 100644 index 00000000..ff30a13e --- /dev/null +++ b/src/runtime/streaming/factory/mod.rs @@ -0,0 +1,53 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod connector; +pub mod global; + +mod operator_constructor; +mod operator_factory; + +use tracing::info; + +use crate::sql::common::constants::factory_operator_name; + +#[allow(unused_imports)] +pub use connector::{ConnectorSinkDispatcher, ConnectorSourceDispatcher, KafkaConnectorDispatcher}; +pub use global::Registry; +pub use operator_factory::OperatorFactory; + +fn register_builtin_connectors(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::CONNECTOR_SOURCE, + Box::new(connector::ConnectorSourceDispatcher), + ); + factory.register( + factory_operator_name::CONNECTOR_SINK, + Box::new(connector::ConnectorSinkDispatcher), + ); +} + +fn register_kafka_connector_plugins(factory: &mut OperatorFactory) { + factory.register( + factory_operator_name::KAFKA_SOURCE, + Box::new(KafkaConnectorDispatcher), + ); + factory.register( + factory_operator_name::KAFKA_SINK, + Box::new(KafkaConnectorDispatcher), + ); + info!( + "Registered Kafka connector plugins ({}, {})", + factory_operator_name::KAFKA_SOURCE, + factory_operator_name::KAFKA_SINK + ); +} diff --git a/src/runtime/streaming/factory/operator_constructor.rs b/src/runtime/streaming/factory/operator_constructor.rs new file mode 100644 index 00000000..5d0ff7d7 --- /dev/null +++ b/src/runtime/streaming/factory/operator_constructor.rs @@ -0,0 +1,22 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::Result; +use std::sync::Arc; + +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::global::Registry; + +/// Builds a [`ConstructedOperator`] from serialized configuration and a [`Registry`]. +pub trait OperatorConstructor: Send + Sync { + fn with_config(&self, config: &[u8], registry: Arc) -> Result; +} diff --git a/src/runtime/streaming/factory/operator_factory.rs b/src/runtime/streaming/factory/operator_factory.rs new file mode 100644 index 00000000..1ce04eeb --- /dev/null +++ b/src/runtime/streaming/factory/operator_factory.rs @@ -0,0 +1,284 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::operator_constructor::OperatorConstructor; +use crate::runtime::streaming::api::operator::ConstructedOperator; +use crate::runtime::streaming::factory::connector::{ + ConnectorSinkDispatcher, ConnectorSourceDispatcher, +}; +use crate::runtime::streaming::factory::global::Registry; +use crate::runtime::streaming::operators::grouping::IncrementalAggregatingConstructor; +use crate::runtime::streaming::operators::joins::{ + InstantJoinConstructor, JoinWithExpirationConstructor, +}; +use anyhow::{Result, anyhow}; +use prost::Message; +use protocol::function_stream_graph::ProjectionOperator as ProjectionOperatorProto; +use std::collections::HashMap; +use std::sync::Arc; + +use crate::runtime::streaming::operators::watermark::WatermarkGeneratorConstructor; +use crate::runtime::streaming::operators::windows::{ + SessionAggregatingWindowConstructor, SlidingAggregatingWindowConstructor, + TumblingAggregateWindowConstructor, WindowFunctionConstructor, +}; +use crate::runtime::streaming::operators::{ + KeyExecutionOperator, ProjectionOperator, StatelessPhysicalExecutor, ValueExecutionOperator, +}; +use protocol::function_stream_graph::{ + ExpressionWatermarkConfig, JoinOperator as JoinOperatorProto, KeyPlanOperator as KeyByProto, + SessionWindowAggregateOperator, SlidingWindowAggregateOperator, + TumblingWindowAggregateOperator, UpdatingAggregateOperator, ValuePlanOperator, + WindowFunctionOperator as WindowFunctionProto, +}; + +use crate::sql::logical_node::logical::OperatorName; + +pub struct OperatorFactory { + constructors: HashMap>, + registry: Arc, +} + +impl OperatorFactory { + pub fn new(registry: Arc) -> Self { + let mut factory = Self { + constructors: HashMap::new(), + registry, + }; + factory.register_builtins(); + factory + } + + pub fn register(&mut self, name: &str, constructor: Box) { + self.constructors.insert(name.to_string(), constructor); + } + + pub fn register_named( + &mut self, + name: OperatorName, + constructor: Box, + ) { + 
self.register(name.as_registry_key(), constructor); + } + + pub fn create_operator(&self, name: &str, payload: &[u8]) -> Result { + let ctor = self.constructors.get(name).ok_or_else(|| { + anyhow!( + "FATAL: Operator '{}' not found in Factory Registry. \ + Ensure the worker is compiled with the correct plugins.", + name + ) + })?; + + ctor.with_config(payload, self.registry.clone()) + } + + pub fn registered_operators(&self) -> Vec<&str> { + self.constructors.keys().map(|s| s.as_str()).collect() + } + + fn register_builtins(&mut self) { + self.register_named( + OperatorName::TumblingWindowAggregate, + Box::new(TumblingWindowBridge), + ); + self.register_named( + OperatorName::SlidingWindowAggregate, + Box::new(SlidingWindowBridge), + ); + self.register_named( + OperatorName::SessionWindowAggregate, + Box::new(SessionWindowBridge), + ); + + self.register_named(OperatorName::ExpressionWatermark, Box::new(WatermarkBridge)); + + // ─── SQL Window Function ─── + self.register_named(OperatorName::WindowFunction, Box::new(WindowFunctionBridge)); + + // ─── Join ─── + self.register_named(OperatorName::Join, Box::new(JoinWithExpirationBridge)); + self.register_named(OperatorName::InstantJoin, Box::new(InstantJoinBridge)); + self.register_named(OperatorName::LookupJoin, Box::new(LookupJoinBridge)); + + self.register_named( + OperatorName::UpdatingAggregate, + Box::new(IncrementalAggregateBridge), + ); + + self.register_named(OperatorName::KeyBy, Box::new(KeyByBridge)); + + self.register_named(OperatorName::Projection, Box::new(ProjectionConstructor)); + self.register_named(OperatorName::Value, Box::new(ValueBridge)); + self.register_named( + OperatorName::ConnectorSource, + Box::new(ConnectorSourceBridge), + ); + self.register_named(OperatorName::ConnectorSink, Box::new(ConnectorSinkBridge)); + + crate::runtime::streaming::factory::register_kafka_connector_plugins(self); + } +} + +struct TumblingWindowBridge; +impl OperatorConstructor for TumblingWindowBridge { + fn 
with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = TumblingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode TumblingWindowAggregateOperator failed: {e}"))?; + let op = TumblingAggregateWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SlidingWindowBridge; +impl OperatorConstructor for SlidingWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SlidingWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SlidingWindowAggregateOperator failed: {e}"))?; + let op = SlidingAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct SessionWindowBridge; +impl OperatorConstructor for SessionWindowBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = SessionWindowAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode SessionWindowAggregateOperator failed: {e}"))?; + let op = SessionAggregatingWindowConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WatermarkBridge; +impl OperatorConstructor for WatermarkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ExpressionWatermarkConfig::decode(config) + .map_err(|e| anyhow!("Decode ExpressionWatermarkConfig failed: {e}"))?; + let op = WatermarkGeneratorConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct WindowFunctionBridge; +impl OperatorConstructor for WindowFunctionBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = WindowFunctionProto::decode(config) + .map_err(|e| anyhow!("Decode WindowFunctionOperator failed: {e}"))?; + let op = WindowFunctionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct 
JoinWithExpirationBridge; +impl OperatorConstructor for JoinWithExpirationBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (expiration) failed: {e}"))?; + let op = JoinWithExpirationConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct InstantJoinBridge; +impl OperatorConstructor for InstantJoinBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = JoinOperatorProto::decode(config) + .map_err(|e| anyhow!("Decode JoinOperator (instant) failed: {e}"))?; + let op = InstantJoinConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct LookupJoinBridge; +impl OperatorConstructor for LookupJoinBridge { + fn with_config(&self, _config: &[u8], _registry: Arc) -> Result { + Err(anyhow!( + "LookupJoin is not supported in the current runtime" + )) + } +} + +struct IncrementalAggregateBridge; +impl OperatorConstructor for IncrementalAggregateBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = UpdatingAggregateOperator::decode(config) + .map_err(|e| anyhow!("Decode UpdatingAggregateOperator failed: {e}"))?; + let op = IncrementalAggregatingConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct KeyByBridge; +impl OperatorConstructor for KeyByBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = KeyByProto::decode(config) + .map_err(|e| anyhow!("Decode KeyPlanOperator failed: {e}"))?; + let executor = StatelessPhysicalExecutor::new(&proto.physical_plan, registry.as_ref()) + .map_err(|e| anyhow!("build key execution plan '{}': {e}", proto.name))?; + let name = if proto.name.is_empty() { + OperatorName::KeyBy.to_string() + } else { + proto.name.clone() + }; + let key_indices: Vec = 
proto.key_fields.iter().map(|&f| f as usize).collect(); + let op = KeyExecutionOperator::new(name, executor, key_indices); + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +pub struct ProjectionConstructor; + +impl OperatorConstructor for ProjectionConstructor { + fn with_config(&self, payload: &[u8], registry: Arc) -> Result { + let proto = ProjectionOperatorProto::decode(payload)?; + let op = ProjectionOperator::from_proto(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct ValueBridge; +impl OperatorConstructor for ValueBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + let proto = ValuePlanOperator::decode(config) + .map_err(|e| anyhow!("Decode ValuePlanOperator failed: {e}"))?; + let op = ValueExecutionConstructor.with_config(proto, registry)?; + Ok(ConstructedOperator::Operator(Box::new(op))) + } +} + +struct ConnectorSourceBridge; +impl OperatorConstructor for ConnectorSourceBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSourceDispatcher.with_config(config, registry) + } +} + +struct ConnectorSinkBridge; +impl OperatorConstructor for ConnectorSinkBridge { + fn with_config(&self, config: &[u8], registry: Arc) -> Result { + ConnectorSinkDispatcher.with_config(config, registry) + } +} + +struct ValueExecutionConstructor; +impl ValueExecutionConstructor { + fn with_config( + &self, + config: ValuePlanOperator, + registry: Arc, + ) -> Result { + let executor = StatelessPhysicalExecutor::new(&config.physical_plan, registry.as_ref()) + .map_err(|e| anyhow!("build value execution plan '{}': {e}", config.name))?; + Ok(ValueExecutionOperator::new(config.name, executor)) + } +} diff --git a/src/runtime/streaming/format/config.rs b/src/runtime/streaming/format/config.rs new file mode 100644 index 00000000..15a58008 --- /dev/null +++ b/src/runtime/streaming/format/config.rs @@ -0,0 +1,47 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// 
you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum TimestampFormat { + RFC3339, + UnixMillis, + UnixSeconds, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum DecimalEncoding { + String, + Number, + Bytes, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum BadDataPolicy { + Fail, + Drop, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonFormat { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, + pub include_schema: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Format { + Json(JsonFormat), + RawString, + RawBytes, +} diff --git a/src/runtime/streaming/format/deserializer.rs b/src/runtime/streaming/format/deserializer.rs new file mode 100644 index 00000000..4058908f --- /dev/null +++ b/src/runtime/streaming/format/deserializer.rs @@ -0,0 +1,230 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Context, Result, anyhow}; +use arrow_array::builder::{BinaryBuilder, StringBuilder, TimestampNanosecondBuilder}; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_json::reader::ReaderBuilder; +use arrow_schema::{Schema, SchemaRef}; +use std::sync::Arc; +use tracing::{debug, warn}; + +use super::config::{BadDataPolicy, Format}; +use crate::sql::common::TIMESTAMP_FIELD; + +/// `DataDeserializer` handles high-throughput message transformation +/// into Apache Arrow `RecordBatch`. +pub struct DataDeserializer { + format: Format, + final_schema: SchemaRef, + decoder_schema: SchemaRef, + bad_data_policy: BadDataPolicy, +} + +impl DataDeserializer { + pub fn new(format: Format, schema: SchemaRef, bad_data_policy: BadDataPolicy) -> Self { + let decoder_schema = schema_without_timestamp(schema.as_ref()); + Self { + format, + final_schema: schema, + decoder_schema, + bad_data_policy, + } + } + + /// High-performance entry point for batch deserialization. + pub fn deserialize_batch_with_kafka_timestamps( + &self, + messages: &[&[u8]], + kafka_timestamps_ms: &[u64], + ) -> Result { + if messages.is_empty() { + return Ok(RecordBatch::new_empty(self.final_schema.clone())); + } + + // Defensive check: align timestamps with messages + let ts_len = kafka_timestamps_ms.len(); + let msg_len = messages.len(); + if ts_len > 0 && ts_len != msg_len { + warn!(msg_len, ts_len, "Kafka timestamps count mismatch"); + } + + match &self.format { + Format::Json(_) => self.deserialize_json(messages, kafka_timestamps_ms), + Format::RawString => self.deserialize_raw_string(messages, kafka_timestamps_ms), + Format::RawBytes => self.deserialize_raw_bytes(messages, kafka_timestamps_ms), + } + } + + /// JSON Deserialization with Row-Level Fault Tolerance. + /// Performance Strategy: Uses an NDJSON (Newline Delimited JSON) approach + /// but isolates malformed rows prior to full Arrow decoding. 
+ fn deserialize_json( + &self, + messages: &[&[u8]], + kafka_timestamps_ms: &[u64], + ) -> Result { + let mut valid_messages = Vec::with_capacity(messages.len()); + let mut valid_indices = Vec::with_capacity(messages.len()); + let mut total_size = 0; + + // Step 1: Pre-scan for data quality (Fault Isolation) + for (i, msg) in messages.iter().enumerate() { + // Fast-path: Check if it's a valid JSON object/array without full binding + if serde_json::from_slice::(msg).is_ok() { + valid_messages.push(*msg); + valid_indices.push(i); + total_size += msg.len() + 1; // +1 for newline + } else { + match self.bad_data_policy { + BadDataPolicy::Fail => { + return Err(anyhow!("Invalid JSON encountered at index {}", i)); + } + BadDataPolicy::Drop => { + debug!(index = i, "Dropped malformed JSON row"); + continue; + } + } + } + } + + if valid_messages.is_empty() { + return Ok(RecordBatch::new_empty(self.final_schema.clone())); + } + + // Step 2: Batch Decode valid rows + let mut buffer = Vec::with_capacity(total_size); + for msg in valid_messages { + buffer.extend_from_slice(msg); + buffer.push(b'\n'); + } + + let mut decoder = ReaderBuilder::new(self.decoder_schema.clone()) + .with_strict_mode(false) + .build_decoder() + .context("Failed to build Arrow JSON decoder")?; + + decoder + .decode(&buffer) + .context("Arrow batch decoding failed")?; + + let decoded_batch = decoder + .flush()? 
+ .ok_or_else(|| anyhow!("Decoder returned empty batch after successful validation"))?; + + // Step 3: Re-inject Event-Time Column + self.rebuild_with_timestamp(decoded_batch, kafka_timestamps_ms, &valid_indices) + } + + fn deserialize_raw_string( + &self, + messages: &[&[u8]], + kafka_timestamps_ms: &[u64], + ) -> Result { + self.decoder_schema + .index_of("value") + .context("Schema must contain 'value' for RawString")?; + + let mut builder = + StringBuilder::with_capacity(messages.len(), messages.iter().map(|m| m.len()).sum()); + for msg in messages { + builder.append_value(String::from_utf8_lossy(msg)); + } + + let mut columns: Vec = Vec::with_capacity(self.decoder_schema.fields().len()); + columns.push(Arc::new(builder.finish())); + + let decoded_batch = RecordBatch::try_new(self.decoder_schema.clone(), columns)?; + let indices: Vec = (0..messages.len()).collect(); + + self.rebuild_with_timestamp(decoded_batch, kafka_timestamps_ms, &indices) + } + + fn deserialize_raw_bytes( + &self, + messages: &[&[u8]], + kafka_timestamps_ms: &[u64], + ) -> Result { + self.decoder_schema + .index_of("value") + .context("Schema must contain 'value' for RawBytes")?; + + let mut builder = + BinaryBuilder::with_capacity(messages.len(), messages.iter().map(|m| m.len()).sum()); + for msg in messages { + builder.append_value(msg); + } + + let mut columns: Vec = Vec::with_capacity(self.decoder_schema.fields().len()); + columns.push(Arc::new(builder.finish())); + + let decoded_batch = RecordBatch::try_new(self.decoder_schema.clone(), columns)?; + let indices: Vec = (0..messages.len()).collect(); + + self.rebuild_with_timestamp(decoded_batch, kafka_timestamps_ms, &indices) + } + + /// Assembler: Merges the decoded Batch with external Event-Time (Watermark) data. 
+ fn rebuild_with_timestamp( + &self, + decoded_batch: RecordBatch, + kafka_timestamps_ms: &[u64], + valid_indices: &[usize], + ) -> Result { + let num_rows = decoded_batch.num_rows(); + + // Safety check for indices + if valid_indices.len() != num_rows { + return Err(anyhow!( + "Alignment error: valid rows ({}) != decoded rows ({})", + valid_indices.len(), + num_rows + )); + } + + // 1. Build Timestamp Column (Nanoseconds for Arrow standard) + let mut ts_builder = TimestampNanosecondBuilder::with_capacity(num_rows); + for &idx in valid_indices { + let ms = kafka_timestamps_ms.get(idx).copied().unwrap_or(0); + ts_builder.append_value((ms as i64).saturating_mul(1_000_000)); + } + let timestamp_col: ArrayRef = Arc::new(ts_builder.finish()); + + // 2. Final Assembly based on Target Schema + let mut final_columns = Vec::with_capacity(self.final_schema.fields().len()); + for field in self.final_schema.fields() { + if field.name() == TIMESTAMP_FIELD { + final_columns.push(timestamp_col.clone()); + } else { + let col = decoded_batch.column_by_name(field.name()).ok_or_else(|| { + anyhow!("Field '{}' not found in decoded batch", field.name()) + })?; + final_columns.push(col.clone()); + } + } + + RecordBatch::try_new(self.final_schema.clone(), final_columns) + .context("Failed to assemble final RecordBatch with event-time") + } +} + +/// Helper: Strips the specialized timestamp field to allow the raw decoder +/// to focus only on payload data. 
+fn schema_without_timestamp(schema: &Schema) -> SchemaRef { + let fields = schema + .fields() + .iter() + .filter(|f| f.name() != TIMESTAMP_FIELD) + .cloned() + .collect::>(); + Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone())) +} diff --git a/src/runtime/streaming/format/json_encoder.rs b/src/runtime/streaming/format/json_encoder.rs new file mode 100644 index 00000000..8be2d649 --- /dev/null +++ b/src/runtime/streaming/format/json_encoder.rs @@ -0,0 +1,197 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use arrow_array::{ + Array, BinaryArray, Decimal128Array, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow_json::writer::NullableEncoder; +use arrow_json::{Encoder, EncoderFactory, EncoderOptions}; +use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit}; +use base64::Engine; +use base64::prelude::BASE64_STANDARD; +use std::io::Write; + +use super::config::{DecimalEncoding, TimestampFormat}; + +#[derive(Debug)] +pub struct CustomEncoderFactory { + pub timestamp_format: TimestampFormat, + pub decimal_encoding: DecimalEncoding, +} + +impl EncoderFactory for CustomEncoderFactory { + fn make_default_encoder<'a>( + &self, + _field: &'a FieldRef, + array: &'a dyn Array, + _options: &'a EncoderOptions, + ) -> Result>, ArrowError> { + let downcast_err = |expected: &str| { + ArrowError::CastError(format!( + "Physical array type mismatch: expected {} for logical type {:?}", + expected, + array.data_type() + )) + }; + + let encoder: Box = match ( + &self.decimal_encoding, + &self.timestamp_format, + array.data_type(), + ) { + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Nanosecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("TimestampNanosecondArray"))? + .clone(); + Box::new(UnixMillisEncoder::Nanos(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Microsecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("TimestampMicrosecondArray"))? + .clone(); + Box::new(UnixMillisEncoder::Micros(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Millisecond, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("TimestampMillisecondArray"))? 
+ .clone(); + Box::new(UnixMillisEncoder::Millis(arr)) + } + (_, TimestampFormat::UnixMillis, DataType::Timestamp(TimeUnit::Second, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("TimestampSecondArray"))? + .clone(); + Box::new(UnixMillisEncoder::Seconds(arr)) + } + + (DecimalEncoding::String, _, DataType::Decimal128(_, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("Decimal128Array"))? + .clone(); + Box::new(DecimalEncoder::StringEncoder(arr)) + } + (DecimalEncoding::Bytes, _, DataType::Decimal128(_, _)) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("Decimal128Array"))? + .clone(); + Box::new(DecimalEncoder::BytesEncoder(arr)) + } + + (_, _, DataType::Binary) => { + let arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| downcast_err("BinaryArray"))? + .clone(); + Box::new(BinaryEncoder(arr)) + } + + _ => return Ok(None), + }; + + Ok(Some(NullableEncoder::new(encoder, array.nulls().cloned()))) + } +} + +// --------------------------------------------------------------------------- +// Timestamp Encoders +// --------------------------------------------------------------------------- + +enum UnixMillisEncoder { + Nanos(TimestampNanosecondArray), + Micros(TimestampMicrosecondArray), + Millis(TimestampMillisecondArray), + Seconds(TimestampSecondArray), +} + +impl Encoder for UnixMillisEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let millis = match self { + Self::Nanos(arr) => arr.value(idx) / 1_000_000, + Self::Micros(arr) => arr.value(idx) / 1_000, + Self::Millis(arr) => arr.value(idx), + Self::Seconds(arr) => arr.value(idx) * 1_000, + }; + + write!(out, "{millis}").expect("Writing integer to Vec buffer should never fail"); + } +} + +// --------------------------------------------------------------------------- +// Decimal Encoders +// --------------------------------------------------------------------------- + +enum 
DecimalEncoder { + StringEncoder(Decimal128Array), + BytesEncoder(Decimal128Array), +} + +impl Encoder for DecimalEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + match self { + Self::StringEncoder(arr) => { + out.push(b'"'); + out.extend_from_slice(arr.value_as_string(idx).as_bytes()); + out.push(b'"'); + } + Self::BytesEncoder(arr) => { + let bytes = arr.value(idx).to_be_bytes(); + let mut stack_buf = [0u8; 24]; + BASE64_STANDARD + .encode_slice(bytes, &mut stack_buf) + .expect("Base64 encode_slice size mismatch"); + + out.push(b'"'); + out.extend_from_slice(&stack_buf); + out.push(b'"'); + } + } + } +} + +// --------------------------------------------------------------------------- +// Binary Encoder +// --------------------------------------------------------------------------- + +struct BinaryEncoder(BinaryArray); + +impl Encoder for BinaryEncoder { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let bytes = self.0.value(idx); + + let b64_len = bytes.len().saturating_add(2) / 3 * 4; + + out.push(b'"'); + + let start_idx = out.len(); + out.resize(start_idx + b64_len, 0); + + BASE64_STANDARD + .encode_slice(bytes, &mut out[start_idx..]) + .expect("Base64 encode_slice buffer capacity should match exactly"); + + out.push(b'"'); + } +} diff --git a/src/runtime/streaming/format/mod.rs b/src/runtime/streaming/format/mod.rs new file mode 100644 index 00000000..d5e63a9d --- /dev/null +++ b/src/runtime/streaming/format/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod config; +pub mod deserializer; +pub mod json_encoder; +pub mod serializer; + +pub use config::{BadDataPolicy, DecimalEncoding, Format, JsonFormat, TimestampFormat}; +pub use deserializer::DataDeserializer; +pub use serializer::DataSerializer; diff --git a/src/runtime/streaming/format/serializer.rs b/src/runtime/streaming/format/serializer.rs new file mode 100644 index 00000000..6aa47f17 --- /dev/null +++ b/src/runtime/streaming/format/serializer.rs @@ -0,0 +1,159 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Context, Result}; +use arrow_array::cast::AsArray; +use arrow_array::{Array, RecordBatch, StructArray}; +use arrow_json::EncoderOptions; +use arrow_json::writer::make_encoder; +use arrow_schema::{Field, SchemaRef}; +use std::sync::Arc; +use tracing::warn; + +use super::config::{Format, JsonFormat}; +use super::json_encoder::CustomEncoderFactory; +use crate::sql::common::TIMESTAMP_FIELD; + +pub struct DataSerializer { + format: Format, + projection_indices: Vec, +} + +impl DataSerializer { + pub fn new(format: Format, schema: SchemaRef) -> Self { + let projection_indices: Vec = schema + .fields() + .iter() + .enumerate() + .filter(|(_, f)| f.name() != TIMESTAMP_FIELD) + .map(|(i, _)| i) + .collect(); + + Self { + format, + projection_indices, + } + } + + pub fn serialize(&self, batch: &RecordBatch) -> Result>> { + if batch.num_rows() == 0 { + return Ok(Vec::new()); + } + + let projected_batch = batch + .project(&self.projection_indices) + .context("Failed to project RecordBatch (removing timestamp column)")?; + + match &self.format { + Format::Json(config) => self + .serialize_json(config, &projected_batch) + .context("JSON serialization failed"), + Format::RawString => self + .serialize_raw_string(&projected_batch) + .context("RawString serialization failed"), + Format::RawBytes => self + .serialize_raw_bytes(&projected_batch) + .context("RawBytes serialization failed"), + } + } + + fn serialize_json(&self, config: &JsonFormat, batch: &RecordBatch) -> Result>> { + let num_rows = batch.num_rows(); + let array = StructArray::from(batch.clone()); + let field = Arc::new(Field::new_struct( + "", + batch.schema().fields().clone(), + false, + )); + + let encoder_factory = Arc::new(CustomEncoderFactory { + timestamp_format: config.timestamp_format.clone(), + decimal_encoding: config.decimal_encoding.clone(), + }); + + let options = EncoderOptions::default() + .with_explicit_nulls(true) + .with_encoder_factory(encoder_factory); + + let mut encoder = + 
make_encoder(&field, &array, &options).context("Failed to build Arrow JSON encoder")?; + + let mut results = Vec::with_capacity(num_rows); + + let mut shared_buf = Vec::with_capacity(512); + + for idx in 0..num_rows { + shared_buf.clear(); + encoder.encode(idx, &mut shared_buf); + + if !shared_buf.is_empty() { + results.push(shared_buf.to_vec()); + } else { + warn!( + row_index = idx, + "JSON encoder produced an empty buffer for row" + ); + } + } + Ok(results) + } + + fn serialize_raw_string(&self, batch: &RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .context("RawString format requires a 'value' column in the schema")?; + + let string_array = batch + .column(value_idx) + .as_string_opt::() + .context("RawString 'value' column is physically not a valid Utf8 Array")?; + + let num_rows = batch.num_rows(); + let mut results = Vec::with_capacity(num_rows); + + for i in 0..num_rows { + if string_array.is_null(i) { + results.push(Vec::new()); + } else { + results.push(string_array.value(i).as_bytes().to_vec()); + } + } + + Ok(results) + } + + fn serialize_raw_bytes(&self, batch: &RecordBatch) -> Result>> { + let value_idx = batch + .schema() + .index_of("value") + .context("RawBytes format requires a 'value' column in the schema")?; + + let binary_array = batch + .column(value_idx) + .as_binary_opt::() + .context("RawBytes 'value' column is physically not a valid Binary Array")?; + + let num_rows = batch.num_rows(); + let mut results = Vec::with_capacity(num_rows); + + for i in 0..num_rows { + if binary_array.is_null(i) { + results.push(Vec::new()); + } else { + results.push(binary_array.value(i).to_vec()); + } + } + + Ok(results) + } +} diff --git a/src/runtime/streaming/job/edge_manager.rs b/src/runtime/streaming/job/edge_manager.rs new file mode 100644 index 00000000..00c94485 --- /dev/null +++ b/src/runtime/streaming/job/edge_manager.rs @@ -0,0 +1,100 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// 
you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use anyhow::{Result, anyhow}; +use tokio::sync::mpsc; +use tracing::{debug, info, warn}; + +use crate::runtime::streaming::protocol::event::TrackedEvent; +use protocol::function_stream_graph::{FsEdge, FsNode}; + +const DEFAULT_CHANNEL_CAPACITY: usize = 2048; + +type TrackedEventEndpoints = ( + Vec>, + Vec>, +); + +pub struct EdgeManager { + endpoints: HashMap, +} + +impl EdgeManager { + pub fn build(nodes: &[FsNode], edges: &[FsEdge]) -> Self { + Self::build_with_capacity(nodes, edges, DEFAULT_CHANNEL_CAPACITY) + } + + pub fn build_with_capacity(nodes: &[FsNode], edges: &[FsEdge], capacity: usize) -> Self { + info!( + "Building EdgeManager for {} nodes and {} edges (channel capacity: {})", + nodes.len(), + edges.len(), + capacity + ); + + let mut tx_map: HashMap>> = + HashMap::with_capacity(nodes.len()); + let mut rx_map: HashMap>> = + HashMap::with_capacity(nodes.len()); + + for edge in edges { + let source_id = edge.source as u32; + let target_id = edge.target as u32; + + let (tx, rx) = mpsc::channel(capacity); + + tx_map.entry(source_id).or_default().push(tx); + rx_map.entry(target_id).or_default().push(rx); + + debug!( + "Created physical edge channel: Node {} -> Node {}", + source_id, target_id + ); + } + + let mut endpoints = HashMap::with_capacity(nodes.len()); + for node in nodes { + let id = node.node_index as u32; + + let inboxes = rx_map.remove(&id).unwrap_or_default(); + let outboxes = tx_map.remove(&id).unwrap_or_default(); + + 
endpoints.insert(id, (inboxes, outboxes)); + } + + for remaining_target in rx_map.keys() { + warn!( + "Topology Warning: Found incoming edges pointing to non-existent Node {}", + remaining_target + ); + } + for remaining_source in tx_map.keys() { + warn!( + "Topology Warning: Found outgoing edges coming from non-existent Node {}", + remaining_source + ); + } + + Self { endpoints } + } + + pub fn take_endpoints(&mut self, id: u32) -> Result { + self.endpoints + .remove(&id) + .ok_or_else(|| anyhow!( + "Topology Error: Endpoints for Node {} not found or already taken. Execution Graph may be inconsistent.", + id + )) + } +} diff --git a/src/runtime/streaming/job/job_manager.rs b/src/runtime/streaming/job/job_manager.rs new file mode 100644 index 00000000..3082dc56 --- /dev/null +++ b/src/runtime/streaming/job/job_manager.rs @@ -0,0 +1,512 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, OnceLock, RwLock}; + +use anyhow::{Context, Result, anyhow, bail, ensure}; +use tokio::sync::mpsc; +use tokio_stream::wrappers::ReceiverStream; +use tracing::{error, info, warn}; + +use protocol::function_stream_graph::{ChainedOperator, FsProgram}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::{ConstructedOperator, Operator}; +use crate::runtime::streaming::api::source::SourceOperator; +use crate::runtime::streaming::execution::{ChainBuilder, Pipeline, SourceDriver}; +use crate::runtime::streaming::factory::OperatorFactory; +use crate::runtime::streaming::job::edge_manager::EdgeManager; +use crate::runtime::streaming::job::models::{ + PhysicalExecutionGraph, PhysicalPipeline, PipelineStatus, StreamingJobRollupStatus, +}; +use crate::runtime::streaming::memory::MemoryPool; +use crate::runtime::streaming::network::endpoint::{BoxedEventStream, PhysicalSender}; +use crate::runtime::streaming::protocol::control::{ControlCommand, StopMode}; + +#[derive(Debug, Clone)] +pub struct StreamingJobSummary { + pub job_id: String, + pub status: StreamingJobRollupStatus, + pub pipeline_count: i32, + pub uptime_secs: u64, +} + +#[derive(Debug, Clone)] +pub struct PipelineDetail { + pub pipeline_id: u32, + pub status: String, +} + +#[derive(Debug, Clone)] +pub struct StreamingJobDetail { + pub job_id: String, + pub status: StreamingJobRollupStatus, + pub pipeline_count: i32, + pub uptime_secs: u64, + pub pipelines: Vec, + pub program: FsProgram, +} + +static GLOBAL_JOB_MANAGER: OnceLock> = OnceLock::new(); + +pub struct JobManager { + active_jobs: Arc>>, + operator_factory: Arc, + memory_pool: Arc, +} + +struct PreparedChain { + source: Option>, + operators: Vec>, +} + +enum PipelineRunner { + Source(SourceDriver), + Standard(Pipeline), +} + +impl PipelineRunner { + async fn run(self) -> Result<(), crate::runtime::streaming::error::RunError> { + match self { + 
PipelineRunner::Source(driver) => driver.run().await, + PipelineRunner::Standard(pipeline) => pipeline.run().await, + } + } +} + +impl JobManager { + pub fn new(operator_factory: Arc, max_memory_bytes: usize) -> Self { + Self { + active_jobs: Arc::new(RwLock::new(HashMap::new())), + operator_factory, + memory_pool: MemoryPool::new(max_memory_bytes), + } + } + + pub fn init(factory: Arc, memory_bytes: usize) -> Result<()> { + GLOBAL_JOB_MANAGER + .set(Arc::new(Self::new(factory, memory_bytes))) + .map_err(|_| anyhow!("JobManager singleton already initialized")) + } + + pub fn global() -> Result> { + GLOBAL_JOB_MANAGER + .get() + .cloned() + .ok_or_else(|| anyhow!("JobManager not initialized. Call init() first.")) + } + + pub async fn submit_job(&self, job_id: String, program: FsProgram) -> Result { + let mut edge_manager = EdgeManager::build(&program.nodes, &program.edges); + let mut pipelines = HashMap::with_capacity(program.nodes.len()); + + for node in &program.nodes { + let pipeline_id = node.node_index as u32; + + let pipeline = self + .build_and_spawn_pipeline( + job_id.clone(), + pipeline_id, + &node.operators, + &mut edge_manager, + ) + .with_context(|| { + format!( + "Failed to build pipeline {} for job {}", + pipeline_id, job_id + ) + })?; + + pipelines.insert(pipeline_id, pipeline); + } + + let graph = PhysicalExecutionGraph { + job_id: job_id.clone(), + program, + pipelines, + start_time: std::time::Instant::now(), + }; + + let mut jobs_guard = self + .active_jobs + .write() + .map_err(|e| anyhow!("Active jobs lock poisoned: {}", e))?; + jobs_guard.insert(job_id.clone(), graph); + + info!(job_id = %job_id, "Job submitted successfully."); + Ok(job_id) + } + + pub async fn stop_job(&self, job_id: &str, mode: StopMode) -> Result<()> { + let control_senders = self.extract_control_senders(job_id)?; + + for tx in control_senders { + let _ = tx.send(ControlCommand::Stop { mode: mode.clone() }).await; + } + + info!(job_id = %job_id, mode = ?mode, "Job stop 
signal dispatched."); + Ok(()) + } + + pub async fn remove_job(&self, job_id: &str, mode: StopMode) -> Result<()> { + self.stop_job(job_id, mode).await?; + + let mut jobs_guard = self + .active_jobs + .write() + .map_err(|_| anyhow!("Active jobs lock poisoned"))?; + + if jobs_guard.remove(job_id).is_some() { + info!(job_id = %job_id, "Job removed from JobManager."); + Ok(()) + } else { + bail!("Job not found during removal: {}", job_id) + } + } + + pub fn has_job(&self, job_id: &str) -> bool { + self.active_jobs + .read() + .map(|guard| guard.contains_key(job_id)) + .unwrap_or(false) + } + + pub fn list_jobs(&self) -> Vec { + let Ok(jobs_guard) = self.active_jobs.read() else { + warn!("Failed to read active_jobs due to lock poisoning."); + return vec![]; + }; + + jobs_guard + .values() + .map(|graph| { + let pipeline_count = graph.pipelines.len() as i32; + let uptime_secs = graph.start_time.elapsed().as_secs(); + let status = Self::aggregate_pipeline_status(&graph.pipelines); + StreamingJobSummary { + job_id: graph.job_id.clone(), + status, + pipeline_count, + uptime_secs, + } + }) + .collect() + } + + pub fn get_job_detail(&self, job_id: &str) -> Option { + let jobs_guard = self.active_jobs.read().ok()?; + let graph = jobs_guard.get(job_id)?; + + let uptime_secs = graph.start_time.elapsed().as_secs(); + let overall_status = Self::aggregate_pipeline_status(&graph.pipelines); + + let pipeline_details: Vec = graph + .pipelines + .iter() + .map(|(id, pipeline)| { + let status = pipeline + .status + .read() + .map(|s| s.clone()) + .unwrap_or_else(|_| PipelineStatus::Failed { + error: "Status lock poisoned".into(), + is_panic: true, + }); + + PipelineDetail { + pipeline_id: *id, + status: format!("{status:?}"), + } + }) + .collect(); + + Some(StreamingJobDetail { + job_id: graph.job_id.clone(), + status: overall_status, + pipeline_count: graph.pipelines.len() as i32, + uptime_secs, + pipelines: pipeline_details, + program: graph.program.clone(), + }) + } + + pub fn 
get_pipeline_statuses(&self, job_id: &str) -> Option> { + let jobs_guard = self.active_jobs.read().ok()?; + let graph = jobs_guard.get(job_id)?; + + Some( + graph + .pipelines + .iter() + .map(|(id, pipeline)| { + let status = pipeline + .status + .read() + .map(|s| s.clone()) + .unwrap_or_else(|_| PipelineStatus::Failed { + error: "Status lock poisoned".into(), + is_panic: true, + }); + (*id, status) + }) + .collect(), + ) + } + + fn aggregate_pipeline_status( + pipelines: &HashMap, + ) -> StreamingJobRollupStatus { + let mut running = 0u32; + let mut failed = 0u32; + let mut finished = 0u32; + let mut initializing = 0u32; + + for pipeline in pipelines.values() { + let status = pipeline + .status + .read() + .map(|s| s.clone()) + .unwrap_or_else(|_| PipelineStatus::Failed { + error: "Status lock poisoned".into(), + is_panic: true, + }); + + match status { + PipelineStatus::Running => running += 1, + PipelineStatus::Failed { .. } => failed += 1, + PipelineStatus::Finished => finished += 1, + PipelineStatus::Initializing => initializing += 1, + PipelineStatus::Stopping => {} + } + } + + let n = pipelines.len() as u32; + if failed > 0 { + StreamingJobRollupStatus::Degraded + } else if running > 0 && running == n { + StreamingJobRollupStatus::Running + } else if finished == n { + StreamingJobRollupStatus::Finished + } else if initializing > 0 { + StreamingJobRollupStatus::Initializing + } else { + StreamingJobRollupStatus::Reconciling + } + } + fn extract_control_senders(&self, job_id: &str) -> Result>> { + let jobs_guard = self + .active_jobs + .read() + .map_err(|_| anyhow!("Active jobs lock poisoned"))?; + + let graph = jobs_guard + .get(job_id) + .ok_or_else(|| anyhow!("Job not found: {job_id}"))?; + + Ok(graph + .pipelines + .values() + .map(|p| p.control_tx.clone()) + .collect()) + } + + fn build_and_spawn_pipeline( + &self, + job_id: String, + pipeline_id: u32, + operators: &[ChainedOperator], + edge_manager: &mut EdgeManager, + ) -> Result { + let 
(raw_inboxes, raw_outboxes) = + edge_manager.take_endpoints(pipeline_id).with_context(|| { + format!( + "Failed to retrieve network endpoints for pipeline {}", + pipeline_id + ) + })?; + + let physical_outboxes: Vec = raw_outboxes + .into_iter() + .map(PhysicalSender::Local) + .collect(); + + let physical_inboxes: Vec = raw_inboxes + .into_iter() + .map(|rx| Box::pin(ReceiverStream::new(rx)) as _) + .collect(); + + let chain = self.build_operator_chain(operators).with_context(|| { + format!( + "Failed to build operator chain for pipeline {}", + pipeline_id + ) + })?; + + ensure!( + chain.source.is_some() || !physical_inboxes.is_empty(), + "Topology Error: Pipeline '{}' contains no source and has no upstream inputs (Dead end).", + pipeline_id + ); + ensure!( + chain.source.is_none() || physical_inboxes.is_empty(), + "Topology Error: Source pipeline '{}' cannot have upstream inputs.", + pipeline_id + ); + + let (control_tx, control_rx) = mpsc::channel(64); + let status = Arc::new(RwLock::new(PipelineStatus::Initializing)); + + let subtask_index = 0; + let parallelism = 1; + let ctx = TaskContext::new( + job_id.clone(), + pipeline_id, + subtask_index, + parallelism, + physical_outboxes, + Arc::clone(&self.memory_pool), + ); + + let runner = if let Some(source) = chain.source { + let chain_head = ChainBuilder::build(chain.operators); + PipelineRunner::Source(SourceDriver::new(source, chain_head, ctx, control_rx)) + } else { + PipelineRunner::Standard( + Pipeline::new(chain.operators, ctx, physical_inboxes, control_rx).with_context( + || format!("Failed to initialize Standard Pipeline {}", pipeline_id), + )?, + ) + }; + + let handle = self + .spawn_worker_thread(job_id, pipeline_id, runner, Arc::clone(&status)) + .with_context(|| format!("Failed to spawn OS thread for pipeline {}", pipeline_id))?; + + Ok(PhysicalPipeline { + pipeline_id, + handle: Some(handle), + status, + control_tx, + }) + } + + fn build_operator_chain(&self, operator_configs: &[ChainedOperator]) -> 
Result { + let mut source: Option> = None; + let mut chain = Vec::with_capacity(operator_configs.len()); + + for op_config in operator_configs { + let constructed = self + .operator_factory + .create_operator(&op_config.operator_name, &op_config.operator_config)?; + + match constructed { + ConstructedOperator::Operator(msg_op) => chain.push(msg_op), + ConstructedOperator::Source(src_op) => { + if source.is_some() { + bail!("Topology Error: Multiple sources in one physical chain."); + } + if !chain.is_empty() { + bail!( + "Topology Error: Source '{}' must be the first operator.", + op_config.operator_name + ); + } + source = Some(src_op); + } + } + } + Ok(PreparedChain { + source, + operators: chain, + }) + } + + fn spawn_worker_thread( + &self, + job_id: String, + pipeline_id: u32, + runner: PipelineRunner, + status: Arc>, + ) -> Result> { + let thread_name = format!("Task-{job_id}-{pipeline_id}"); + + let handle = std::thread::Builder::new() + .name(thread_name) + .spawn(move || { + if let Ok(mut st) = status.write() { + *st = PipelineStatus::Running; + } + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("Failed to build current-thread Tokio runtime"); + + let execution_result = + std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + runner + .run() + .await + .map_err(|e| anyhow!("Execution failed: {e}")) + }) + })); + + Self::handle_pipeline_exit(&job_id, pipeline_id, execution_result, &status); + })?; + + Ok(handle) + } + + fn handle_pipeline_exit( + job_id: &str, + pipeline_id: u32, + thread_result: std::thread::Result>, + status: &RwLock, + ) { + let (final_status, is_fatal) = match thread_result { + Ok(Ok(_)) => { + info!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline finished gracefully."); + (PipelineStatus::Finished, false) + } + Ok(Err(e)) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, error = %e, "Pipeline failed."); + ( + PipelineStatus::Failed { + 
error: e.to_string(), + is_panic: false, + }, + true, + ) + } + Err(_) => { + error!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline thread panicked!"); + ( + PipelineStatus::Failed { + error: "Unexpected panic in task thread".into(), + is_panic: true, + }, + true, + ) + } + }; + + if let Ok(mut st) = status.write() { + *st = final_status; + } + + if is_fatal { + warn!(job_id = %job_id, pipeline_id = pipeline_id, "Pipeline failure detected. Job degraded."); + } + } +} diff --git a/src/runtime/streaming/job/mod.rs b/src/runtime/streaming/job/mod.rs new file mode 100644 index 00000000..02e0343c --- /dev/null +++ b/src/runtime/streaming/job/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod edge_manager; +pub mod job_manager; +pub mod models; + +pub use job_manager::{JobManager, StreamingJobSummary}; diff --git a/src/runtime/streaming/job/models.rs b/src/runtime/streaming/job/models.rs new file mode 100644 index 00000000..f4e2f280 --- /dev/null +++ b/src/runtime/streaming/job/models.rs @@ -0,0 +1,89 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt; +use std::sync::{Arc, RwLock}; +use std::thread::JoinHandle; +use std::time::Instant; + +use protocol::function_stream_graph::FsProgram; +use tokio::sync::mpsc; + +use crate::runtime::streaming::protocol::control::ControlCommand; + +#[derive(Debug, Clone, PartialEq)] +pub enum PipelineStatus { + Initializing, + Running, + Failed { error: String, is_panic: bool }, + Finished, + Stopping, +} + +/// Aggregated lifecycle / health label for an entire streaming job, computed from all +/// [`PhysicalPipeline`] [`PipelineStatus`] values. +/// +/// This is a **roll-up**, not a per-pipeline state: it answers “how is the job as a whole +/// doing?” for listing, SQL result sets, and similar surfaces. Wire representation is a short +/// uppercase token (stable for clients). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum StreamingJobRollupStatus { + /// At least one pipeline has failed (error exit or panic). Other pipelines may still be + /// running or stopping; the job requires operator attention. + Degraded, + /// Every pipeline is in [`PipelineStatus::Running`]: steady-state processing. + Running, + /// Every pipeline has reached [`PipelineStatus::Finished`]: the job graph has quiesced + /// successfully (e.g. bounded job or graceful end-of-stream). + Finished, + /// No pipeline has failed, at least one is still [`PipelineStatus::Initializing`], and the + /// job is not yet uniformly running—startup is still in progress. 
+ Initializing, + /// No failures, but pipelines are in a **mixed** non-failed combination (e.g. some running + /// and some stopping, or counts that do not match the all-running / all-finished rules). + /// Often transient during control operations or uneven pipeline progress. + Reconciling, +} + +impl StreamingJobRollupStatus { + /// Stable token exposed in APIs and SQL output (historical uppercase spelling). + pub const fn as_str(self) -> &'static str { + match self { + Self::Degraded => "DEGRADED", + Self::Running => "RUNNING", + Self::Finished => "FINISHED", + Self::Initializing => "INITIALIZING", + Self::Reconciling => "RECONCILING", + } + } +} + +impl fmt::Display for StreamingJobRollupStatus { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +pub struct PhysicalPipeline { + pub pipeline_id: u32, + pub handle: Option>, + pub status: Arc>, + pub control_tx: mpsc::Sender, +} + +pub struct PhysicalExecutionGraph { + pub job_id: String, + pub program: FsProgram, + pub pipelines: HashMap, + pub start_time: Instant, +} diff --git a/src/runtime/streaming/memory/mod.rs b/src/runtime/streaming/memory/mod.rs new file mode 100644 index 00000000..45fc3194 --- /dev/null +++ b/src/runtime/streaming/memory/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod pool; +pub mod ticket; + +pub use pool::MemoryPool; +pub use ticket::MemoryTicket; diff --git a/src/runtime/streaming/memory/pool.rs b/src/runtime/streaming/memory/pool.rs new file mode 100644 index 00000000..b6a06ad2 --- /dev/null +++ b/src/runtime/streaming/memory/pool.rs @@ -0,0 +1,89 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use parking_lot::Mutex; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use tokio::sync::Notify; +use tracing::{debug, warn}; + +use super::ticket::MemoryTicket; + +#[derive(Debug)] +pub struct MemoryPool { + max_bytes: usize, + used_bytes: AtomicUsize, + available_bytes: Mutex, + notify: Notify, +} + +impl MemoryPool { + pub fn new(max_bytes: usize) -> Arc { + Arc::new(Self { + max_bytes, + used_bytes: AtomicUsize::new(0), + available_bytes: Mutex::new(max_bytes), + notify: Notify::new(), + }) + } + + pub fn usage_metrics(&self) -> (usize, usize) { + (self.used_bytes.load(Ordering::Relaxed), self.max_bytes) + } + + pub async fn request_memory(self: &Arc, bytes: usize) -> MemoryTicket { + if bytes == 0 { + return MemoryTicket::new(0, self.clone()); + } + + if bytes > self.max_bytes { + warn!( + "Requested memory ({} B) exceeds total pool size ({} B)! 
\ + Permitting to avoid pipeline deadlock, but OOM risk is critical.", + bytes, self.max_bytes + ); + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + + loop { + { + let mut available = self.available_bytes.lock(); + if *available >= bytes { + *available -= bytes; + self.used_bytes.fetch_add(bytes, Ordering::Relaxed); + return MemoryTicket::new(bytes, self.clone()); + } + } + + debug!( + "Backpressure engaged: waiting for {} bytes to be freed...", + bytes + ); + self.notify.notified().await; + } + } + + pub(crate) fn release(&self, bytes: usize) { + if bytes == 0 { + return; + } + + { + let mut available = self.available_bytes.lock(); + *available += bytes; + } + + self.used_bytes.fetch_sub(bytes, Ordering::Relaxed); + self.notify.notify_waiters(); + } +} diff --git a/src/runtime/streaming/memory/ticket.rs b/src/runtime/streaming/memory/ticket.rs new file mode 100644 index 00000000..cb105be0 --- /dev/null +++ b/src/runtime/streaming/memory/ticket.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use super::pool::MemoryPool; + +#[derive(Debug)] +pub struct MemoryTicket { + bytes: usize, + pool: Arc, +} + +impl MemoryTicket { + pub(crate) fn new(bytes: usize, pool: Arc) -> Self { + Self { bytes, pool } + } +} + +impl Drop for MemoryTicket { + fn drop(&mut self) { + self.pool.release(self.bytes); + } +} diff --git a/src/runtime/streaming/mod.rs b/src/runtime/streaming/mod.rs new file mode 100644 index 00000000..7e0ba57a --- /dev/null +++ b/src/runtime/streaming/mod.rs @@ -0,0 +1,27 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Streaming actor runtime (vendored from Arroyo `arroyo-actor-runtime`). + +pub mod api; +pub mod error; +pub mod execution; +pub mod factory; +pub mod format; +pub mod job; +pub mod memory; +pub mod network; +pub mod operators; +pub mod protocol; + +pub use protocol::StreamOutput; diff --git a/src/runtime/streaming/network/endpoint.rs b/src/runtime/streaming/network/endpoint.rs new file mode 100644 index 00000000..ae75e6fc --- /dev/null +++ b/src/runtime/streaming/network/endpoint.rs @@ -0,0 +1,64 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::runtime::streaming::protocol::event::StreamEvent; +use crate::runtime::streaming::protocol::event::TrackedEvent; +use anyhow::{Result, anyhow}; +use std::pin::Pin; +use tokio::sync::mpsc; +use tokio_stream::Stream; +use tracing::debug; + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub struct RemoteSenderStub { + pub target_addr: String, +} + +impl RemoteSenderStub { + pub async fn send_over_network(&self, _event: &StreamEvent) -> Result<()> { + unimplemented!("Remote network transport is not yet implemented") + } +} + +// ======================================================================== +// ======================================================================== + +#[derive(Clone)] +pub enum PhysicalSender { + Local(mpsc::Sender), + Remote(RemoteSenderStub), +} + +impl PhysicalSender { + pub async fn send(&self, tracked_event: TrackedEvent) -> Result<()> { + match self { + PhysicalSender::Local(tx) => { + tx.send(tracked_event).await.map_err(|_| { + anyhow!("Local channel closed! 
Downstream task may have crashed.") + })?; + } + PhysicalSender::Remote(stub) => { + stub.send_over_network(&tracked_event.event).await?; + debug!("Sent event over network, local memory ticket will be released."); + } + } + Ok(()) + } +} + +// ======================================================================== +// ======================================================================== + +pub type BoxedEventStream = Pin + Send>>; diff --git a/src/runtime/streaming/network/environment.rs b/src/runtime/streaming/network/environment.rs new file mode 100644 index 00000000..c36bc062 --- /dev/null +++ b/src/runtime/streaming/network/environment.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::endpoint::{BoxedEventStream, PhysicalSender}; +use std::collections::HashMap; + +pub type VertexId = u32; +pub type SubtaskIndex = u32; + +pub struct NetworkEnvironment { + pub outboxes: HashMap<(VertexId, SubtaskIndex), Vec>, + pub inboxes: HashMap<(VertexId, SubtaskIndex), Vec>, +} + +impl Default for NetworkEnvironment { + fn default() -> Self { + Self::new() + } +} + +impl NetworkEnvironment { + pub fn new() -> Self { + Self { + outboxes: HashMap::new(), + inboxes: HashMap::new(), + } + } + + pub fn take_outboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.outboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } + + pub fn take_inboxes( + &mut self, + vertex_id: VertexId, + subtask_idx: SubtaskIndex, + ) -> Vec { + self.inboxes + .remove(&(vertex_id, subtask_idx)) + .unwrap_or_default() + } +} diff --git a/src/runtime/streaming/network/mod.rs b/src/runtime/streaming/network/mod.rs new file mode 100644 index 00000000..6a7adc90 --- /dev/null +++ b/src/runtime/streaming/network/mod.rs @@ -0,0 +1,14 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod endpoint; +pub mod environment; diff --git a/src/runtime/streaming/operators/grouping/incremental_aggregate.rs b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs new file mode 100644 index 00000000..625cdee5 --- /dev/null +++ b/src/runtime/streaming/operators/grouping/incremental_aggregate.rs @@ -0,0 +1,912 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::constants::updating_state_field; +use anyhow::{Result, bail}; +use arrow::compute::max_array; +use arrow::row::{RowConverter, SortField}; +use arrow_array::builder::{ + BinaryBuilder, TimestampNanosecondBuilder, UInt32Builder, UInt64Builder, +}; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_array::{Array, ArrayRef, BooleanArray, RecordBatch, StructArray}; +use arrow_schema::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::common::{Result as DFResult, ScalarValue}; +use datafusion::physical_expr::aggregate::AggregateFunctionExpr; +use datafusion::physical_plan::{Accumulator, PhysicalExpr}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use datafusion_proto::protobuf::PhysicalPlanNode; +use datafusion_proto::protobuf::physical_plan_node::PhysicalPlanType; +use itertools::Itertools; +use prost::Message; +use 
protocol::function_stream_graph::UpdatingAggregateOperator; +use std::collections::HashSet; +use std::sync::LazyLock; +use std::time::{Duration, Instant, SystemTime}; +use std::{collections::HashMap, mem, sync::Arc}; +use tracing::{debug, warn}; +// ========================================================================= +// ========================================================================= +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::runtime::streaming::operators::{Key, UpdatingCache}; +use crate::runtime::util::decode_aggregate; +use crate::sql::common::{ + CheckpointBarrier, FsSchema, TIMESTAMP_FIELD, UPDATING_META_FIELD, Watermark, to_nanos, +}; +use crate::sql::physical::updating_meta_fields; + +#[derive(Debug, Copy, Clone)] +struct BatchData { + count: u64, + generation: u64, +} + +impl BatchData { + fn new(generation: u64) -> Self { + Self { + count: 1, + generation, + } + } + + fn inc(&mut self) { + self.count += 1; + self.generation += 1; + } + + fn dec(&mut self) { + self.count = self.count.saturating_sub(1); + self.generation += 1; + } +} + +#[derive(Debug)] +enum IncrementalState { + Sliding { + expr: Arc, + accumulator: Box, + }, + Batch { + expr: Arc, + data: HashMap, + row_converter: Arc, + changed_values: HashSet, + }, +} + +impl IncrementalState { + fn update_batch(&mut self, new_generation: u64, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => { + accumulator.update_batch(batch)?; + } + IncrementalState::Batch { + data, + row_converter, + changed_values, + .. 
+ } => { + for r in row_converter.convert_columns(batch)?.iter() { + if data.contains_key(r.as_ref()) { + data.get_mut(r.as_ref()).unwrap().inc(); + changed_values.insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } else { + let key = Key(Arc::new(r.as_ref().to_vec())); + data.insert(key.clone(), BatchData::new(new_generation)); + changed_values.insert(key); + } + } + } + } + Ok(()) + } + + fn retract_batch(&mut self, batch: &[ArrayRef]) -> DFResult<()> { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.retract_batch(batch), + IncrementalState::Batch { + data, + row_converter, + changed_values, + .. + } => { + for r in row_converter.convert_columns(batch)?.iter() { + match data.get(r.as_ref()).map(|d| d.count) { + Some(0) => { + debug!( + "tried to retract value for key with count 0; implies append lost" + ); + } + Some(_) => { + data.get_mut(r.as_ref()).unwrap().dec(); + changed_values + .insert(data.get_key_value(r.as_ref()).unwrap().0.clone()); + } + None => { + debug!("tried to retract value for missing key: implies append lost"); + } + } + } + Ok(()) + } + } + } + + fn evaluate(&mut self) -> DFResult { + match self { + IncrementalState::Sliding { accumulator, .. } => accumulator.evaluate(), + IncrementalState::Batch { + expr, + data, + row_converter, + .. 
+ } => { + let parser = row_converter.parser(); + let input = row_converter.convert_rows( + data.iter() + .filter(|(_, c)| c.count > 0) + .map(|(v, _)| parser.parse(&v.0)), + )?; + let mut acc = expr.create_accumulator()?; + acc.update_batch(&input)?; + acc.evaluate_mut() + } + } + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum AccumulatorType { + Sliding, + Batch, +} + +impl AccumulatorType { + fn state_fields(&self, agg: &AggregateFunctionExpr) -> DFResult> { + Ok(match self { + AccumulatorType::Sliding => agg.sliding_state_fields()?, + AccumulatorType::Batch => vec![], + }) + } +} + +#[derive(Debug)] +struct Aggregator { + func: Arc, + accumulator_type: AccumulatorType, + row_converter: Arc, + state_cols: Vec, +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingFunc { + flush_interval: Duration, + metadata_expr: Arc, + aggregates: Vec, + accumulators: UpdatingCache>, + updated_keys: HashMap>>, + + input_schema: Arc, + has_routing_keys: bool, + + sliding_state_schema: Arc, + batch_state_schema: Arc, + schema_without_metadata: Arc, + final_output_schema: Arc, + ttl: Duration, + key_converter: RowConverter, + new_generation: u64, +} + +static GLOBAL_KEY: LazyLock>> = LazyLock::new(|| Arc::new(Vec::new())); + +impl IncrementalAggregatingFunc { + fn update_batch( + &mut self, + key: &[u8], + batch: &[Vec], + idx: Option, + ) -> DFResult<()> { + self.accumulators + .modify_and_update(key, Instant::now(), |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.update_batch(self.new_generation, values)?; + } + Ok(()) + }) + .expect("tried to update for non-existent key") + } + + fn retract_batch( + &mut self, + key: &[u8], + batch: &[Vec], + idx: Option, + ) -> DFResult<()> { + 
self.accumulators + .modify(key, |values| { + for (inputs, accs) in batch.iter().zip(values.iter_mut()) { + let values = if let Some(idx) = idx { + &inputs.iter().map(|c| c.slice(idx, 1)).collect() + } else { + inputs + }; + accs.retract_batch(values)?; + } + Ok::<(), datafusion::common::DataFusionError>(()) + }) + .expect("tried to retract state for non-existent key")?; + Ok(()) + } + + fn evaluate(&mut self, key: &[u8]) -> DFResult> { + self.accumulators + .get_mut(key) + .expect("tried to evaluate non-existent key") + .iter_mut() + .map(|s| s.evaluate()) + .collect::>() + } + + fn get_retracts(batch: &RecordBatch) -> Option<&BooleanArray> { + if let Some(meta_col) = batch.column_by_name(UPDATING_META_FIELD) { + let meta_struct = meta_col + .as_any() + .downcast_ref::() + .expect("_updating_meta must be StructArray"); + + let is_retract_array = meta_struct + .column_by_name(updating_state_field::IS_RETRACT) + .expect("meta struct must have is_retract"); + + Some( + is_retract_array + .as_any() + .downcast_ref::() + .expect("is_retract must be BooleanArray"), + ) + } else { + None + } + } + + fn make_accumulators(&self) -> Vec { + self.aggregates + .iter() + .map(|agg| match agg.accumulator_type { + AccumulatorType::Sliding => IncrementalState::Sliding { + expr: agg.func.clone(), + accumulator: agg.func.create_sliding_accumulator().unwrap(), + }, + AccumulatorType::Batch => IncrementalState::Batch { + expr: agg.func.clone(), + data: Default::default(), + row_converter: agg.row_converter.clone(), + changed_values: Default::default(), + }, + }) + .collect() + } + + fn compute_inputs(&self, batch: &RecordBatch) -> Vec> { + self.aggregates + .iter() + .map(|agg| { + agg.func + .expressions() + .iter() + .map(|ex| { + ex.evaluate(batch) + .unwrap() + .into_array(batch.num_rows()) + .unwrap() + }) + .collect::>() + }) + .collect::>() + } + + fn global_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + let 
aggregate_input_cols = self.compute_inputs(batch); + + let mut first = false; + if !self + .accumulators + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + first = true; + self.accumulators.insert( + GLOBAL_KEY.clone(), + Instant::now(), + self.new_generation, + self.make_accumulators(), + ); + } + + if !self + .updated_keys + .contains_key(GLOBAL_KEY.as_ref().as_slice()) + { + if first { + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), None); + } else { + let v = Some(self.evaluate(GLOBAL_KEY.as_ref().as_slice())?); + self.updated_keys.insert(Key(GLOBAL_KEY.clone()), v); + } + } + + if let Some(retracts) = retracts { + for (i, r) in retracts.iter().enumerate() { + if r.unwrap_or_default() { + self.retract_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } else { + self.update_batch( + GLOBAL_KEY.as_ref().as_slice(), + &aggregate_input_cols, + Some(i), + )?; + } + } + } else { + self.update_batch(GLOBAL_KEY.as_ref().as_slice(), &aggregate_input_cols, None) + .unwrap(); + } + Ok(()) + } + + fn keyed_aggregate(&mut self, batch: &RecordBatch) -> Result<()> { + let retracts = Self::get_retracts(batch); + + let sort_columns = &self + .input_schema + .sort_columns(batch, false) + .into_iter() + .map(|e| e.values) + .collect::>(); + + let keys = self.key_converter.convert_columns(sort_columns).unwrap(); + + for k in &keys { + if !self.updated_keys.contains_key(k.as_ref()) { + if let Some((key, accs)) = self.accumulators.get_mut_key_value(k.as_ref()) { + self.updated_keys.insert( + key, + Some( + accs.iter_mut() + .map(|s| s.evaluate()) + .collect::>()?, + ), + ); + } else { + self.updated_keys + .insert(Key(Arc::new(k.as_ref().to_vec())), None); + } + } + } + + let aggregate_input_cols = self.compute_inputs(batch); + + for (i, key) in keys.iter().enumerate() { + if !self.accumulators.contains_key(key.as_ref()) { + self.accumulators.insert( + Arc::new(key.as_ref().to_vec()), + Instant::now(), + 0, + self.make_accumulators(), + ); + 
}; + + let retract = retracts.map(|r| r.value(i)).unwrap_or_default(); + if retract { + self.retract_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } else { + self.update_batch(key.as_ref(), &aggregate_input_cols, Some(i))?; + } + } + Ok(()) + } + + // ========================================================================= + // ========================================================================= + + fn checkpoint_sliding(&mut self) -> DFResult>> { + if self.updated_keys.is_empty() { + return Ok(None); + } + + let mut states = vec![vec![]; self.sliding_state_schema.schema.fields.len()]; + let parser = self.key_converter.parser(); + let mut generation_builder = UInt64Builder::with_capacity(self.updated_keys.len()); + + let mut cols = self + .key_converter + .convert_rows(self.updated_keys.keys().map(|k| { + let (accumulators, generation) = + self.accumulators.get_mut_generation(k.0.as_ref()).unwrap(); + generation_builder.append_value(generation); + + for (state, agg) in accumulators.iter_mut().zip(self.aggregates.iter()) { + let IncrementalState::Sliding { expr, accumulator } = state else { + continue; + }; + let state = accumulator.state().unwrap_or_else(|_| { + let state = accumulator.state().unwrap(); + *accumulator = expr.create_sliding_accumulator().unwrap(); + let states: Vec<_> = + state.iter().map(|s| s.to_array()).try_collect().unwrap(); + accumulator.merge_batch(&states).unwrap(); + state + }); + + for (idx, v) in agg.state_cols.iter().zip(state.into_iter()) { + states[*idx].push(v); + } + } + parser.parse(k.0.as_ref()) + }))?; + + cols.extend( + states + .into_iter() + .skip(cols.len()) + .map(|c| ScalarValue::iter_to_array(c).unwrap()), + ); + + let generations = generation_builder.finish(); + self.new_generation = self + .new_generation + .max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn checkpoint_batch(&mut self) -> DFResult>> { + if self + .aggregates + .iter() + .all(|agg| 
agg.accumulator_type == AccumulatorType::Sliding) + { + return Ok(None); + } + if self.updated_keys.is_empty() { + return Ok(None); + } + + let size = self.updated_keys.len(); + let mut rows = Vec::with_capacity(size); + let mut accumulator_builder = UInt32Builder::with_capacity(size); + let mut args_row_builder = BinaryBuilder::with_capacity(size, size * 4); + let mut count_builder = UInt64Builder::with_capacity(size); + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(size); + let mut generation_builder = UInt64Builder::with_capacity(size); + + let now = to_nanos(SystemTime::now()) as i64; + let parser = self.key_converter.parser(); + + for k in self.updated_keys.keys() { + let row = parser.parse(&k.0); + for (i, state) in self + .accumulators + .get_mut(k.0.as_ref()) + .unwrap() + .iter_mut() + .enumerate() + { + let IncrementalState::Batch { + data, + changed_values, + .. + } = state + else { + continue; + }; + + for vk in changed_values.iter() { + if let Some(count) = data.get(vk) { + accumulator_builder.append_value(i as u32); + args_row_builder.append_value(&*vk.0); + count_builder.append_value(count.count); + generation_builder.append_value(count.generation); + timestamp_builder.append_value(now); + rows.push(row.to_owned()) + } + } + data.retain(|_, v| v.count > 0); + } + } + + let mut cols = self.key_converter.convert_rows(rows.into_iter())?; + cols.push(Arc::new(accumulator_builder.finish())); + cols.push(Arc::new(args_row_builder.finish())); + cols.push(Arc::new(count_builder.finish())); + cols.push(Arc::new(timestamp_builder.finish())); + + let generations = generation_builder.finish(); + self.new_generation = self + .new_generation + .max(max_array::(&generations).unwrap()); + cols.push(Arc::new(generations)); + + Ok(Some(cols)) + } + + fn restore_sliding( + &mut self, + key: &[u8], + now: Instant, + i: usize, + aggregate_states: &Vec>, + generation: u64, + ) -> Result<()> { + let mut accumulators = self.make_accumulators(); + for 
((_, state_cols), acc) in self + .aggregates + .iter() + .zip(aggregate_states.iter()) + .zip(accumulators.iter_mut()) + { + if let IncrementalState::Sliding { accumulator, .. } = acc { + accumulator.merge_batch(&state_cols.iter().map(|c| c.slice(i, 1)).collect_vec())? + } + } + self.accumulators + .insert(Arc::new(key.to_vec()), now, generation, accumulators); + Ok(()) + } + + async fn initialize(&mut self, _ctx: &mut TaskContext) -> Result<()> { + let mut deleted_keys = vec![]; + for (k, v) in self.accumulators.iter_mut() { + let is_deleted = v.last_mut().unwrap().evaluate()?.is_null(); + if is_deleted { + deleted_keys.push(k.clone()); + } else { + for is in v { + if let IncrementalState::Batch { data, .. } = is { + data.retain(|_, v| v.count > 0); + } + } + } + } + for k in deleted_keys { + self.accumulators.remove(&k.0); + } + Ok(()) + } + + fn generate_changelog(&mut self) -> Result> { + let mut output_keys = Vec::with_capacity(self.updated_keys.len() * 2); + let mut output_values = + vec![Vec::with_capacity(self.updated_keys.len() * 2); self.aggregates.len()]; + let mut is_retracts = Vec::with_capacity(self.updated_keys.len() * 2); + + let (updated_keys, updated_values): (Vec<_>, Vec<_>) = + mem::take(&mut self.updated_keys).into_iter().unzip(); + let mut deleted_keys = vec![]; + + for (k, retract) in updated_keys.iter().zip(updated_values.into_iter()) { + let append = self.evaluate(&k.0)?; + + if let Some(v) = retract { + if v.iter() + .zip(append.iter()) + .take(v.len() - 1) + .all(|(a, b)| a == b) + { + continue; + } + is_retracts.push(true); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(v) { + out.push(val); + } + } + + if !append.last().unwrap().is_null() { + is_retracts.push(false); + output_keys.push(k.clone()); + for (out, val) in output_values.iter_mut().zip(append) { + out.push(val); + } + } else { + deleted_keys.push(k); + } + } + + for k in deleted_keys { + self.accumulators.remove(&k.0); + } + + let mut ttld_keys 
= vec![]; + for (k, mut v) in self.accumulators.time_out(Instant::now()) { + is_retracts.push(true); + ttld_keys.push(k); + for (out, val) in output_values + .iter_mut() + .zip(v.iter_mut().map(|s| s.evaluate())) + { + out.push(val?); + } + } + + if output_keys.is_empty() && ttld_keys.is_empty() { + return Ok(None); + } + + let row_parser = self.key_converter.parser(); + let mut result_cols = self.key_converter.convert_rows( + output_keys + .iter() + .map(|k| row_parser.parse(k.0.as_slice())) + .chain(ttld_keys.iter().map(|k| row_parser.parse(k.as_slice()))), + )?; + + for acc in output_values.into_iter() { + result_cols.push(ScalarValue::iter_to_array(acc).unwrap()); + } + + let record_batch = + RecordBatch::try_new(self.schema_without_metadata.clone(), result_cols).unwrap(); + + let metadata = self + .metadata_expr + .evaluate(&record_batch) + .unwrap() + .into_array(record_batch.num_rows()) + .unwrap(); + let metadata = set_retract_metadata(metadata, Arc::new(BooleanArray::from(is_retracts))); + + let mut final_batch = record_batch.columns().to_vec(); + final_batch.push(metadata); + + Ok(Some(RecordBatch::try_new( + self.final_output_schema.clone(), + final_batch, + )?)) + } +} + +fn set_retract_metadata(metadata: ArrayRef, is_retract: Arc) -> ArrayRef { + let metadata = metadata.as_struct(); + let arrays: Vec> = vec![is_retract, metadata.column(1).clone()]; + Arc::new(StructArray::new(updating_meta_fields(), arrays, None)) +} + +// ========================================================================= +// ========================================================================= + +#[async_trait::async_trait] +impl Operator for IncrementalAggregatingFunc { + fn name(&self) -> &str { + "UpdatingAggregatingFunc" + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.initialize(ctx).await?; + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if 
self.has_routing_keys { + self.keyed_aggregate(&batch)?; + } else { + self.global_aggregate(&batch)?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + if let Some(changelog_batch) = self.generate_changelog()? { + Ok(vec![StreamOutput::Forward(changelog_batch)]) + } else { + Ok(vec![]) + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ========================================================================= +// ========================================================================= + +pub struct IncrementalAggregatingConstructor; + +impl IncrementalAggregatingConstructor { + pub fn with_config( + &self, + config: UpdatingAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let ttl = Duration::from_micros(if config.ttl_micros == 0 { + warn!("ttl was not set for updating aggregate"); + 24 * 60 * 60 * 1000 * 1000 + } else { + config.ttl_micros + }); + + let input_schema: FsSchema = config.input_schema.unwrap().try_into()?; + let final_schema: FsSchema = config.final_schema.unwrap().try_into()?; + let mut schema_without_metadata = SchemaBuilder::from((*final_schema.schema).clone()); + schema_without_metadata.remove(final_schema.schema.index_of(UPDATING_META_FIELD).unwrap()); + + let metadata_expr = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.metadata_expr.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let aggregate_exec = PhysicalPlanNode::decode(&mut config.aggregate_exec.as_ref())?; + let PhysicalPlanType::Aggregate(aggregate_exec) = + aggregate_exec.physical_plan_type.unwrap() + else { + bail!("invalid proto"); + }; + + let mut sliding_state_fields = input_schema + .routing_keys() + .map(|v| { + v.iter() + 
.map(|idx| input_schema.schema.field(*idx).clone()) + .collect_vec() + }) + .unwrap_or_default(); + + let has_routing_keys = input_schema.routing_keys().is_some(); + let mut batch_state_fields = sliding_state_fields.clone(); + let key_fields = (0..sliding_state_fields.len()).collect_vec(); + + let aggregates: Vec<_> = aggregate_exec + .aggr_expr + .iter() + .zip(aggregate_exec.aggr_expr_name.iter()) + .map(|(expr, name)| { + Ok(decode_aggregate( + &input_schema.schema, + name, + expr, + registry.as_ref(), + )?) + }) + .map_ok(|agg| { + let retract = match agg.create_sliding_accumulator() { + Ok(s) => s.supports_retract_batch(), + _ => false, + }; + ( + agg, + if retract { + AccumulatorType::Sliding + } else { + AccumulatorType::Batch + }, + ) + }) + .map_ok(|(agg, t)| { + let row_converter = Arc::new(RowConverter::new( + agg.expressions() + .iter() + .map(|ex| Ok(SortField::new(ex.data_type(&input_schema.schema)?))) + .collect::>()?, + )?); + let fields = t.state_fields(&agg)?; + let field_names = fields.iter().map(|f| f.name().to_string()).collect_vec(); + sliding_state_fields.extend(fields.into_iter().map(|f| (*f).clone())); + Ok::<_, anyhow::Error>((agg, t, row_converter, field_names)) + }) + .flatten_ok() + .collect::>()?; + + let state_schema = Schema::new(sliding_state_fields); + + let aggregates = aggregates + .into_iter() + .map(|(agg, t, row_converter, field_names)| Aggregator { + func: agg, + accumulator_type: t, + row_converter, + state_cols: field_names + .iter() + .map(|f| state_schema.index_of(f).unwrap()) + .collect(), + }) + .collect(); + + let mut state_fields = state_schema.fields().to_vec(); + let timestamp_field = state_fields.pop().unwrap(); + state_fields.push(Arc::new( + (*timestamp_field).clone().with_name(TIMESTAMP_FIELD), + )); + + let sliding_state_schema = Arc::new(FsSchema::from_schema_keys( + Arc::new(Schema::new(state_fields)), + key_fields.clone(), + )?); + + batch_state_fields.push(Field::new("accumulator", DataType::UInt32, 
false)); + batch_state_fields.push(Field::new("args_row", DataType::Binary, false)); + batch_state_fields.push(Field::new("count", DataType::UInt64, false)); + batch_state_fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )); + let timestamp_index = batch_state_fields.len() - 1; + + let mut storage_key_fields = key_fields.clone(); + storage_key_fields.push(storage_key_fields.len()); + storage_key_fields.push(storage_key_fields.len()); + + let batch_state_schema = Arc::new(FsSchema::new( + Arc::new(Schema::new(batch_state_fields)), + timestamp_index, + Some(storage_key_fields), + Some(key_fields), + )); + + Ok(IncrementalAggregatingFunc { + flush_interval: Duration::from_micros(config.flush_interval_micros), + metadata_expr, + ttl, + aggregates, + accumulators: UpdatingCache::with_time_to_idle(ttl), + schema_without_metadata: Arc::new(schema_without_metadata.finish()), + final_output_schema: final_schema.schema.clone(), + updated_keys: Default::default(), + input_schema: Arc::new(input_schema.clone()), + has_routing_keys, + key_converter: RowConverter::new(input_schema.sort_fields(false))?, + sliding_state_schema, + batch_state_schema, + new_generation: 0, + }) + } +} diff --git a/src/runtime/streaming/operators/grouping/mod.rs b/src/runtime/streaming/operators/grouping/mod.rs new file mode 100644 index 00000000..2a17a49d --- /dev/null +++ b/src/runtime/streaming/operators/grouping/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod incremental_aggregate; +pub mod updating_cache; + +pub use incremental_aggregate::IncrementalAggregatingConstructor; +pub use updating_cache::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/grouping/updating_cache.rs b/src/runtime/streaming/operators/grouping/updating_cache.rs new file mode 100644 index 00000000..37f2ba04 --- /dev/null +++ b/src/runtime/streaming/operators/grouping/updating_cache.rs @@ -0,0 +1,507 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::borrow::Borrow; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +#[derive(Hash, Eq, PartialEq, Clone, Debug)] +pub struct Key(pub Arc>); + +impl Borrow<[u8]> for Key { + fn borrow(&self) -> &[u8] { + &self.0 + } +} + +struct Node { + key: Key, + data: Option, + generation: u64, + updated: Instant, + prev: Option, + next: Option, +} + +pub struct UpdatingCache { + map: HashMap, + nodes: Vec>, + free_list: Vec, + head: Option, + tail: Option, + ttl: Duration, +} + +struct TTLIter<'a, T: Send + Sync> { + now: Instant, + cache: &'a mut UpdatingCache, +} + +impl Iterator for TTLIter<'_, T> { + type Item = (Arc>, T); + + fn next(&mut self) -> Option { + let head_idx = self.cache.head?; + let node = &self.cache.nodes[head_idx]; + + if self.now.saturating_duration_since(node.updated) < self.cache.ttl { + return None; + } + + let (k, v) = self.cache.pop_front()?; + Some((k.0, v)) + } +} + +impl UpdatingCache { + pub fn with_time_to_idle(ttl: Duration) -> Self { + Self { + map: HashMap::new(), + nodes: Vec::new(), + free_list: Vec::new(), + head: None, + tail: None, + ttl, + } + } + + pub fn insert(&mut self, key: Arc>, now: Instant, generation: u64, value: T) { + let key_obj = Key(key); + + if let Some(&idx) = self.map.get(&key_obj) { + if self.nodes[idx].generation >= generation { + return; + } + self.nodes[idx].data = Some(value); + self.nodes[idx].generation = generation; + self.nodes[idx].updated = now; + self.move_to_tail(idx); + return; + } + + let idx = self.allocate_node(key_obj.clone(), value, generation, now); + self.map.insert(key_obj, idx); + self.push_back(idx); + } + + pub fn time_out(&mut self, now: Instant) -> impl Iterator>, T)> + '_ { + TTLIter { now, cache: self } + } + + pub fn iter_mut(&mut self) -> impl Iterator { + self.nodes.iter_mut().filter_map(|n| { + if let Some(data) = &mut n.data { + Some((&n.key, data)) + } else { + None + } + }) + } + + pub fn modify_and_update Result<(), E>>( + &mut 
self, + key: &[u8], + now: Instant, + f: F, + ) -> Option> { + let &idx = self.map.get(key)?; + let node = &mut self.nodes[idx]; + + if let Err(e) = f(node.data.as_mut().unwrap()) { + return Some(Err(e)); + } + + node.generation += 1; + node.updated = now; + self.move_to_tail(idx); + + Some(Ok(())) + } + + pub fn modify Result<(), E>>( + &mut self, + key: &[u8], + f: F, + ) -> Option> { + let &idx = self.map.get(key)?; + let node = &mut self.nodes[idx]; + + node.generation += 1; + + if let Err(e) = f(node.data.as_mut().unwrap()) { + return Some(Err(e)); + } + + Some(Ok(())) + } + + pub fn contains_key(&self, k: &[u8]) -> bool { + self.map.contains_key(k) + } + + pub fn get_mut(&mut self, key: &[u8]) -> Option<&mut T> { + let &idx = self.map.get(key)?; + self.nodes[idx].data.as_mut() + } + + pub fn get_mut_generation(&mut self, key: &[u8]) -> Option<(&mut T, u64)> { + let &idx = self.map.get(key)?; + let node = &mut self.nodes[idx]; + Some((node.data.as_mut().unwrap(), node.generation)) + } + + pub fn get_mut_key_value(&mut self, key: &[u8]) -> Option<(Key, &mut T)> { + let &idx = self.map.get(key)?; + let node = &mut self.nodes[idx]; + Some((node.key.clone(), node.data.as_mut().unwrap())) + } + + pub fn remove(&mut self, key: &[u8]) -> Option { + let &idx = self.map.get(key)?; + self.map.remove(key); + self.remove_node(idx); + + let data = self.nodes[idx].data.take().unwrap(); + self.free_list.push(idx); + + Some(data) + } + + fn pop_front(&mut self) -> Option<(Key, T)> { + let head_idx = self.head?; + self.remove_node(head_idx); + + let node = &mut self.nodes[head_idx]; + self.map.remove(&node.key); + + let key = node.key.clone(); + let data = node.data.take().unwrap(); + self.free_list.push(head_idx); + + Some((key, data)) + } + + fn allocate_node(&mut self, key: Key, data: T, generation: u64, updated: Instant) -> usize { + let new_node = Node { + key, + data: Some(data), + generation, + updated, + prev: None, + next: None, + }; + + if let Some(idx) = 
self.free_list.pop() { + self.nodes[idx] = new_node; + idx + } else { + let idx = self.nodes.len(); + self.nodes.push(new_node); + idx + } + } + + fn push_back(&mut self, index: usize) { + self.nodes[index].prev = self.tail; + self.nodes[index].next = None; + + if let Some(tail_idx) = self.tail { + self.nodes[tail_idx].next = Some(index); + } else { + self.head = Some(index); + } + self.tail = Some(index); + } + + fn remove_node(&mut self, index: usize) { + let prev = self.nodes[index].prev; + let next = self.nodes[index].next; + + if let Some(p) = prev { + self.nodes[p].next = next; + } else { + self.head = next; + } + + if let Some(n) = next { + self.nodes[n].prev = prev; + } else { + self.tail = prev; + } + + self.nodes[index].prev = None; + self.nodes[index].next = None; + } + + fn move_to_tail(&mut self, index: usize) { + if self.tail == Some(index) { + return; + } + self.remove_node(index); + self.push_back(index); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insert_and_modify() { + let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60)); + + let key = Arc::new(vec![1, 2, 3]); + let now = Instant::now(); + cache.insert(key.clone(), now, 1, 42); + + assert!( + cache + .modify(key.as_ref(), |x| { + *x = 43; + Ok::<(), ()>(()) + }) + .unwrap() + .is_ok() + ); + + assert_eq!(*cache.get_mut(key.as_ref()).unwrap(), 43); + } + + #[test] + fn test_timeout() { + let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10)); + + let key1 = Arc::new(vec![1]); + let key2 = Arc::new(vec![2]); + + let start = Instant::now(); + cache.insert(key1.clone(), start, 1, "value1"); + cache.insert(key2.clone(), start + Duration::from_millis(5), 2, "value2"); + + let check_time = start + Duration::from_millis(11); + let timed_out: Vec<_> = cache.time_out(check_time).collect(); + assert_eq!(timed_out.len(), 1); + assert_eq!(&*timed_out[0].0, &*key1); + + assert!(cache.contains_key(key2.as_ref())); + 
assert!(!cache.contains_key(key1.as_ref())); + } + + #[test] + fn test_update_keeps_alive() { + let mut cache = UpdatingCache::with_time_to_idle(Duration::from_millis(10)); + + let key = Arc::new(vec![1]); + let start = Instant::now(); + cache.insert(key.clone(), start, 1, "value"); + + let update_time = start + Duration::from_millis(5); + cache + .modify_and_update(key.as_ref(), update_time, |_| Ok::<(), ()>(())) + .unwrap() + .unwrap(); + + let check_time = start + Duration::from_millis(11); + let timed_out: Vec<_> = cache.time_out(check_time).collect(); + assert!(timed_out.is_empty()); + assert!(cache.contains_key(key.as_ref())); + } + + #[test] + fn test_lru_eviction_order_matches_insertion() { + let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60)); + let key1 = Arc::new(vec![1]); + let key2 = Arc::new(vec![2]); + let key3 = Arc::new(vec![3]); + let now = Instant::now(); + cache.insert(key1.clone(), now, 1, 1); + cache.insert(key2.clone(), now, 2, 2); + cache.insert(key3.clone(), now, 3, 3); + + let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect(); + assert_eq!(evicted.len(), 3); + assert_eq!(evicted[0].0.as_ref(), &*key1); + assert_eq!(evicted[1].0.as_ref(), &*key2); + assert_eq!(evicted[2].0.as_ref(), &*key3); + } + + #[test] + fn test_remove_middle_key() { + let mut cache = UpdatingCache::with_time_to_idle(Duration::from_secs(60)); + let key1 = Arc::new(vec![1]); + let key2 = Arc::new(vec![2]); + let key3 = Arc::new(vec![3]); + let now = Instant::now(); + cache.insert(key1.clone(), now, 1, 1); + cache.insert(key2.clone(), now, 2, 2); + cache.insert(key3.clone(), now, 3, 3); + + assert_eq!(cache.remove(&[2]).unwrap(), 2); + assert!(cache.contains_key(&[1])); + assert!(!cache.contains_key(&[2])); + assert!(cache.contains_key(&[3])); + + let evicted: Vec<_> = cache.time_out(now + Duration::from_secs(61)).collect(); + assert_eq!(evicted.len(), 2); + assert_eq!(evicted[0].0.as_ref(), &*key1); + 
assert_eq!(evicted[1].0.as_ref(), &*key3); + } + + #[test] + fn reorder_with_update() { + let mut cache = UpdatingCache::::with_time_to_idle(Duration::from_secs(10)); + let key1 = Arc::new(vec![1]); + let key2 = Arc::new(vec![2]); + let now = Instant::now(); + + cache.insert(key1.clone(), now, 1, 100); + cache.insert(key2.clone(), now, 2, 200); + + cache + .modify_and_update(&[1], now + Duration::from_secs(1), |v| { + *v += 1; + Ok::<(), ()>(()) + }) + .unwrap() + .unwrap(); + + let _ = cache.modify_and_update(&[1], now + Duration::from_secs(2), |v| { + *v += 1; + Ok::<(), ()>(()) + }); + } + + #[test] + fn test_ttl_eviction() { + let ttl = Duration::from_millis(100); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let now = Instant::now(); + let key1 = Arc::new(vec![1]); + let key2 = Arc::new(vec![2]); + cache.insert(key1.clone(), now, 1, 10); + cache.insert(key2.clone(), now, 2, 20); + + cache + .modify_and_update(&[2], now + Duration::from_millis(50), |v| { + *v += 1; + Ok::<(), ()>(()) + }) + .unwrap() + .unwrap(); + + let now2 = now + Duration::from_millis(150); + let evicted: Vec<_> = cache.time_out(now2).collect(); + assert_eq!(evicted.len(), 2); + assert_eq!(evicted[0].0.as_ref(), &[1]); + assert_eq!(evicted[1].0.as_ref(), &[2]); + } + + #[test] + fn test_remove_key() { + let ttl = Duration::from_millis(100); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let now = Instant::now(); + let key = Arc::new(vec![1]); + cache.insert(key.clone(), now, 1, 42); + let value = cache.remove(&[1]).unwrap(); + assert_eq!(value, 42); + assert!(!cache.contains_key(&[1])); + let evicted: Vec<_> = cache.time_out(now + Duration::from_millis(200)).collect(); + assert!(evicted.is_empty()); + } + + #[test] + fn test_update_order() { + let ttl = Duration::from_secs(1); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let base = Instant::now(); + let key_a = Arc::new(vec![b'A']); + let key_b = Arc::new(vec![b'B']); + let key_c = 
Arc::new(vec![b'C']); + cache.insert(key_a.clone(), base, 1, 1); + cache.insert(key_b.clone(), base, 2, 2); + cache.insert(key_c.clone(), base, 3, 3); + + let t_update = base + Duration::from_millis(500); + cache + .modify_and_update(b"B", t_update, |v| { + *v += 10; + Ok::<(), ()>(()) + }) + .unwrap() + .unwrap(); + + let t_eviction = base + Duration::from_secs(2); + let evicted: Vec<_> = cache.time_out(t_eviction).collect(); + assert_eq!(evicted.len(), 3); + assert_eq!(evicted[0].0.as_ref(), b"A"); + assert_eq!(evicted[1].0.as_ref(), b"C"); + assert_eq!(evicted[2].0.as_ref(), b"B"); + } + + #[test] + fn test_get_mut_key_value() { + let ttl = Duration::from_secs(1); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let base = Instant::now(); + let key = Arc::new(vec![1, 2, 3]); + cache.insert(key.clone(), base, 1, 42); + if let Some((k, v)) = cache.get_mut_key_value(&[1, 2, 3]) { + *v += 1; + assert_eq!(*v, 43); + assert_eq!(k.0.as_ref(), &[1, 2, 3]); + } else { + panic!("Key not found"); + } + } + + #[test] + fn test_modify_error() { + let ttl = Duration::from_secs(1); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let base = Instant::now(); + let key = Arc::new(vec![1]); + cache.insert(key.clone(), base, 1, 42); + let res = cache.modify(&[1], |_v| Err("error")); + assert!(res.unwrap().is_err()); + } + + #[test] + fn test_drop_cleanup() { + let ttl = Duration::from_secs(1); + { + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let base = Instant::now(); + for i in 0..10 { + cache.insert(Arc::new(vec![i as u8]), base, i as u64, i); + } + } + } + + #[test] + fn test_generational_replacement() { + let ttl = Duration::from_secs(1); + let mut cache = UpdatingCache::with_time_to_idle(ttl); + let base = Instant::now(); + let key = Arc::new(vec![1]); + + cache.insert(key.clone(), base, 1, "first"); + assert_eq!(cache.get_mut(&[1]), Some(&mut "first")); + + cache.insert(key.clone(), base, 2, "second"); + assert_eq!(cache.get_mut(&[1]), 
Some(&mut "second")); + + cache.insert(key.clone(), base, 1, "third"); + assert_eq!(cache.get_mut(&[1]), Some(&mut "second")); + } +} diff --git a/src/runtime/streaming/operators/joins/join_instance.rs b/src/runtime/streaming/operators/joins/join_instance.rs new file mode 100644 index 00000000..75513542 --- /dev/null +++ b/src/runtime/streaming/operators/joins/join_instance.rs @@ -0,0 +1,301 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, anyhow}; +use arrow::compute::{max, min, partition, sort_to_indices, take}; +use arrow_array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; +use std::time::SystemTime; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; +use tracing::warn; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::constants::mem_exec_join_side; +use crate::sql::common::{CheckpointBarrier, 
FsSchema, FsSchemaRef, Watermark, from_nanos}; +use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec}; +use async_trait::async_trait; +use protocol::function_stream_graph::JoinOperator; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +impl JoinSide { + #[allow(dead_code)] + fn name(&self) -> &'static str { + match self { + JoinSide::Left => mem_exec_join_side::LEFT, + JoinSide::Right => mem_exec_join_side::RIGHT, + } + } +} + +struct JoinInstance { + left_tx: UnboundedSender, + right_tx: UnboundedSender, + result_stream: SendableRecordBatchStream, +} + +impl JoinInstance { + fn feed_data(&self, batch: RecordBatch, side: JoinSide) -> Result<()> { + match side { + JoinSide::Left => self + .left_tx + .send(batch) + .map_err(|e| anyhow!("Left send err: {}", e)), + JoinSide::Right => self + .right_tx + .send(batch) + .map_err(|e| anyhow!("Right send err: {}", e)), + } + } + + async fn close_and_drain(self) -> Result> { + drop(self.left_tx); + drop(self.right_tx); + + let mut outputs = Vec::new(); + let mut stream = self.result_stream; + + while let Some(result_batch) = stream.next().await { + outputs.push(result_batch?); + } + + Ok(outputs) + } +} + +pub struct InstantJoinOperator { + left_input_schema: FsSchemaRef, + right_input_schema: FsSchemaRef, + active_joins: BTreeMap, + left_receiver_hook: Arc>>>, + right_receiver_hook: Arc>>>, + join_exec_plan: Arc, +} + +impl InstantJoinOperator { + fn input_schema(&self, side: JoinSide) -> FsSchemaRef { + match side { + JoinSide::Left => self.left_input_schema.clone(), + JoinSide::Right => self.right_input_schema.clone(), + } + } + + fn get_or_create_join_instance(&mut self, time: SystemTime) -> Result<&mut JoinInstance> { + use std::collections::btree_map::Entry; + + if let Entry::Vacant(e) = self.active_joins.entry(time) { + let (left_tx, left_rx) = unbounded_channel(); + let (right_tx, right_rx) = unbounded_channel(); + + 
*self.left_receiver_hook.write().unwrap() = Some(left_rx); + *self.right_receiver_hook.write().unwrap() = Some(right_rx); + + self.join_exec_plan.reset().map_err(|e| anyhow!("{e}"))?; + let result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("{e}"))?; + + e.insert(JoinInstance { + left_tx, + right_tx, + result_stream, + }); + } + + self.active_joins + .get_mut(&time) + .ok_or_else(|| anyhow!("join instance missing after insert")) + } + + async fn process_side_internal( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let time_column = batch + .column(self.input_schema(side).timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("Missing timestamp column"))?; + + let min_timestamp = min(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + let max_timestamp = max(time_column).ok_or_else(|| anyhow!("empty timestamp column"))?; + + if let Some(watermark) = ctx.current_watermark() + && watermark > from_nanos(min_timestamp as u128) + { + warn!("Dropped late batch from {:?} before watermark", side); + return Ok(()); + } + + let unkeyed_batch = self.input_schema(side).unkeyed_batch(&batch)?; + + if max_timestamp == min_timestamp { + let time_key = from_nanos(max_timestamp as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(unkeyed_batch, side)?; + return Ok(()); + } + + let indices = sort_to_indices(time_column, None, None)?; + let columns: Vec<_> = unkeyed_batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted_batch = RecordBatch::try_new(unkeyed_batch.schema(), columns)?; + let sorted_timestamps = take(time_column, &indices, None).unwrap(); + let typed_timestamps = sorted_timestamps + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("sorted timestamps downcast failed"))?; + let 
ranges = partition(std::slice::from_ref(&sorted_timestamps)) + .unwrap() + .ranges(); + + for range in ranges { + let sub_batch = sorted_batch.slice(range.start, range.end - range.start); + let time_key = from_nanos(typed_timestamps.value(range.start) as u128); + let join_instance = self.get_or_create_join_instance(time_key)?; + join_instance.feed_data(sub_batch, side)?; + } + + Ok(()) + } +} + +#[async_trait] +impl Operator for InstantJoinOperator { + fn name(&self) -> &str { + "InstantJoin" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side_internal(side, batch, ctx).await?; + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let mut emit_outputs = Vec::new(); + + let mut expired_times = Vec::new(); + for key in self.active_joins.keys() { + if *key < current_time { + expired_times.push(*key); + } else { + break; + } + } + + for time_key in expired_times { + if let Some(join_instance) = self.active_joins.remove(&time_key) { + let joined_batches = join_instance.close_and_drain().await?; + for batch in joined_batches { + emit_outputs.push(StreamOutput::Forward(batch)); + } + } + } + + Ok(emit_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} + +pub struct InstantJoinConstructor; + +impl InstantJoinConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let join_physical_plan_node = PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + + let left_input_schema: Arc = 
Arc::new(config.left_schema.unwrap().try_into()?); + let right_input_schema: Arc = Arc::new(config.right_schema.unwrap().try_into()?); + + let left_receiver_hook = Arc::new(RwLock::new(None)); + let right_receiver_hook = Arc::new(RwLock::new(None)); + + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::LockedJoinStream { + left: left_receiver_hook.clone(), + right: right_receiver_hook.clone(), + }, + }; + + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + Ok(InstantJoinOperator { + left_input_schema, + right_input_schema, + active_joins: BTreeMap::new(), + left_receiver_hook, + right_receiver_hook, + join_exec_plan, + }) + } +} diff --git a/src/runtime/streaming/operators/joins/join_with_expiration.rs b/src/runtime/streaming/operators/joins/join_with_expiration.rs new file mode 100644 index 00000000..60bbe7e3 --- /dev/null +++ b/src/runtime/streaming/operators/joins/join_with_expiration.rs @@ -0,0 +1,280 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, anyhow}; +use arrow::compute::concat_batches; +use arrow_array::RecordBatch; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use futures::StreamExt; +use prost::Message; +use std::collections::VecDeque; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tracing::warn; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark}; +use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec}; +use async_trait::async_trait; +use protocol::function_stream_graph::JoinOperator; + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum JoinSide { + Left, + Right, +} + +// ============================================================================ +// ============================================================================ + +struct StateBuffer { + batches: VecDeque<(SystemTime, RecordBatch)>, + ttl: Duration, +} + +impl StateBuffer { + fn new(ttl: Duration) -> Self { + Self { + batches: VecDeque::new(), + ttl, + } + } + + fn insert(&mut self, batch: RecordBatch, time: SystemTime) { + self.batches.push_back((time, batch)); + } + + fn expire(&mut self, current_time: SystemTime) { + let cutoff = current_time + .checked_sub(self.ttl) + .unwrap_or(SystemTime::UNIX_EPOCH); + while let Some((time, _)) = self.batches.front() { + if *time < cutoff { + self.batches.pop_front(); + } else { + break; + } + } + } + + fn get_all_batches(&self) -> Vec { + self.batches.iter().map(|(_, b)| b.clone()).collect() + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct JoinWithExpirationOperator { + left_input_schema: FsSchema, + right_input_schema: FsSchema, + left_schema: FsSchema, + right_schema: FsSchema, + + left_passer: Arc>>, + right_passer: Arc>>, + join_exec_plan: Arc, + + left_state: StateBuffer, + right_state: StateBuffer, +} + +impl JoinWithExpirationOperator { + async fn compute_pair( + &mut self, + left: RecordBatch, + right: RecordBatch, + ) -> Result> { + if left.num_rows() == 0 || right.num_rows() == 0 { + return Ok(vec![]); + } + + { + self.left_passer.write().unwrap().replace(left); + self.right_passer.write().unwrap().replace(right); + } + + self.join_exec_plan + .reset() + .map_err(|e| anyhow!("join plan reset: {e}"))?; + + let mut result_stream = self + .join_exec_plan + .execute(0, SessionContext::new().task_ctx()) + .map_err(|e| anyhow!("join execute: {e}"))?; + + let mut outputs = Vec::new(); + while let Some(batch) = result_stream.next().await { + outputs.push(batch.map_err(|e| anyhow!("{e}"))?); + } + + Ok(outputs) + } + + async fn process_side( + &mut self, + side: JoinSide, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_time = ctx.current_watermark().unwrap_or_else(SystemTime::now); + + self.left_state.expire(current_time); + self.right_state.expire(current_time); + + match side { + JoinSide::Left => self.left_state.insert(batch.clone(), current_time), + JoinSide::Right => self.right_state.insert(batch.clone(), current_time), + } + + let opposite_batches = match side { + JoinSide::Left => self.right_state.get_all_batches(), + JoinSide::Right => self.left_state.get_all_batches(), + }; + + if opposite_batches.is_empty() { + return Ok(vec![]); + } + + let opposite_schema = match side { + JoinSide::Left => &self.right_schema.schema, + JoinSide::Right => &self.left_schema.schema, + }; + let combined_opposite_batch = concat_batches(opposite_schema, opposite_batches.iter())?; + + let 
unkeyed_target_batch = match side { + JoinSide::Left => self.left_input_schema.unkeyed_batch(&batch)?, + JoinSide::Right => self.right_input_schema.unkeyed_batch(&batch)?, + }; + + let (left_input, right_input) = match side { + JoinSide::Left => (unkeyed_target_batch, combined_opposite_batch), + JoinSide::Right => (combined_opposite_batch, unkeyed_target_batch), + }; + + let result_batches = self.compute_pair(left_input, right_input).await?; + + Ok(result_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } +} + +#[async_trait] +impl Operator for JoinWithExpirationOperator { + fn name(&self) -> &str { + "JoinWithExpiration" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let side = if input_idx == 0 { + JoinSide::Left + } else { + JoinSide::Right + }; + self.process_side(side, batch, ctx).await + } + + async fn process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct JoinWithExpirationConstructor; + +impl JoinWithExpirationConstructor { + pub fn with_config( + &self, + config: JoinOperator, + registry: Arc, + ) -> anyhow::Result { + let left_passer = Arc::new(RwLock::new(None)); + let right_passer = Arc::new(RwLock::new(None)); + + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::LockedJoinPair { + left: left_passer.clone(), + right: right_passer.clone(), + }, + }; + + let join_physical_plan_node = 
PhysicalPlanNode::decode(&mut config.join_plan.as_slice())?; + let join_exec_plan = join_physical_plan_node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let left_input_schema: FsSchema = config.left_schema.unwrap().try_into()?; + let right_input_schema: FsSchema = config.right_schema.unwrap().try_into()?; + let left_schema = left_input_schema.schema_without_keys()?; + let right_schema = right_input_schema.schema_without_keys()?; + + let mut ttl = Duration::from_micros( + config + .ttl_micros + .expect("ttl must be set for non-instant join"), + ); + + if ttl == Duration::ZERO { + warn!("TTL was not set for join with expiration, defaulting to 24 hours."); + ttl = Duration::from_secs(24 * 60 * 60); + } + + Ok(JoinWithExpirationOperator { + left_input_schema, + right_input_schema, + left_schema, + right_schema, + left_passer, + right_passer, + join_exec_plan, + left_state: StateBuffer::new(ttl), + right_state: StateBuffer::new(ttl), + }) + } +} diff --git a/src/runtime/streaming/operators/joins/mod.rs b/src/runtime/streaming/operators/joins/mod.rs new file mode 100644 index 00000000..1cc83d36 --- /dev/null +++ b/src/runtime/streaming/operators/joins/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod join_instance; +pub mod join_with_expiration; + +pub use join_instance::InstantJoinConstructor; +pub use join_with_expiration::JoinWithExpirationConstructor; diff --git a/src/runtime/streaming/operators/key_by.rs b/src/runtime/streaming/operators/key_by.rs new file mode 100644 index 00000000..59206688 --- /dev/null +++ b/src/runtime/streaming/operators/key_by.rs @@ -0,0 +1,162 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, anyhow}; +use arrow::compute::{sort_to_indices, take}; +use arrow_array::{Array, RecordBatch, UInt64Array}; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_common::hash_utils::create_hashes; +use datafusion_physical_expr::expressions::Column; +use std::sync::Arc; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +use protocol::function_stream_graph::KeyPlanOperator; + +pub struct KeyByOperator { + name: String, + key_extractors: Vec>, + random_state: ahash::RandomState, +} + +impl KeyByOperator { + pub fn new(name: String, key_extractors: Vec>) -> Self { + Self { + name, + key_extractors, + random_state: ahash::RandomState::new(), + } + } +} + +#[async_trait] +impl Operator for KeyByOperator { + fn name(&self) -> &str { + &self.name + } + + async fn on_start(&mut self, _ctx: &mut 
TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(vec![]); + } + + let mut key_columns = Vec::with_capacity(self.key_extractors.len()); + for expr in &self.key_extractors { + let column_array = expr + .evaluate(&batch) + .map_err(|e| anyhow!("Failed to evaluate key expr: {}", e))? + .into_array(num_rows) + .map_err(|e| anyhow!("Failed to convert into array: {}", e))?; + key_columns.push(column_array); + } + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_columns, &self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("Failed to compute hashes: {}", e))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort hashes: {}", e))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: std::result::Result, _> = batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(batch.schema(), sorted_columns?)?; + + let mut outputs = Vec::new(); + let mut start_idx = 0; + + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + start_idx = end_idx; + } + + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: 
CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// --------------------------------------------------------------------------- +// Constructor +// --------------------------------------------------------------------------- + +pub struct KeyByConstructor; + +impl KeyByConstructor { + pub fn with_config(&self, config: KeyPlanOperator) -> Result { + let mut key_extractors: Vec> = + Vec::with_capacity(config.key_fields.len()); + + for field_idx in &config.key_fields { + let idx = *field_idx as usize; + let expr = Arc::new(Column::new(&format!("col_{}", idx), idx)) as Arc; + key_extractors.push(expr); + } + + let name = if config.name.is_empty() { + "KeyBy".to_string() + } else { + config.name.clone() + }; + + Ok(KeyByOperator::new(name, key_extractors)) + } +} diff --git a/src/runtime/streaming/operators/key_operator.rs b/src/runtime/streaming/operators/key_operator.rs new file mode 100644 index 00000000..1f4f48c6 --- /dev/null +++ b/src/runtime/streaming/operators/key_operator.rs @@ -0,0 +1,147 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Key-by over the physical plan output: key column(s) are **values** projected by the plan +//! (e.g. `_key_user_id`); **shuffle / `StreamOutput::Keyed` uses `u64` hashes** computed by +//! [`datafusion_common::hash_utils::create_hashes`] on those columns — same mechanism as +//! 
[`crate::runtime::streaming::operators::key_by::KeyByOperator`]. + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::sql::common::{CheckpointBarrier, Watermark}; +use ahash::RandomState; +use anyhow::{Result, anyhow}; +use arrow::compute::{sort_to_indices, take}; +use arrow_array::{Array, ArrayRef, RecordBatch, UInt64Array}; +use async_trait::async_trait; +use datafusion_common::hash_utils::create_hashes; +use futures::StreamExt; + +// =========================================================================== +// =========================================================================== + +pub struct KeyExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, + key_fields: Vec, + random_state: RandomState, +} + +impl KeyExecutionOperator { + pub fn new(name: String, executor: StatelessPhysicalExecutor, key_fields: Vec) -> Self { + let deterministic_random_state = RandomState::with_seeds( + 0x1234567890ABCDEF, + 0x0FEDCBA987654321, + 0x1357924680135792, + 0x2468013579246801, + ); + + Self { + name, + executor, + key_fields, + random_state: deterministic_random_state, + } + } +} + +#[async_trait] +impl Operator for KeyExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + let num_rows = out_batch.num_rows(); + if num_rows == 0 { + continue; + } + + let key_arrays: Vec = self + .key_fields + .iter() + .map(|&i| out_batch.column(i).clone()) + .collect(); + + let mut hash_buffer = vec![0u64; num_rows]; + create_hashes(&key_arrays, 
&self.random_state, &mut hash_buffer) + .map_err(|e| anyhow!("KeyExecution failed to hash columns: {e}"))?; + + let hash_array = UInt64Array::from(hash_buffer); + + let sorted_indices = sort_to_indices(&hash_array, None, None) + .map_err(|e| anyhow!("Failed to sort by hash: {e}"))?; + + let sorted_hashes_ref = take(&hash_array, &sorted_indices, None)?; + let sorted_hashes = sorted_hashes_ref + .as_any() + .downcast_ref::() + .unwrap(); + + let sorted_columns: Result, _> = out_batch + .columns() + .iter() + .map(|col| take(col, &sorted_indices, None)) + .collect(); + let sorted_batch = RecordBatch::try_new(out_batch.schema(), sorted_columns?)?; + + let mut start_idx = 0; + while start_idx < num_rows { + let current_hash = sorted_hashes.value(start_idx); + let mut end_idx = start_idx + 1; + + while end_idx < num_rows && sorted_hashes.value(end_idx) == current_hash { + end_idx += 1; + } + + let sub_batch = sorted_batch.slice(start_idx, end_idx - start_idx); + outputs.push(StreamOutput::Keyed(current_hash, sub_batch)); + + start_idx = end_idx; + } + } + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} diff --git a/src/runtime/streaming/operators/mod.rs b/src/runtime/streaming/operators/mod.rs new file mode 100644 index 00000000..dd13dacb --- /dev/null +++ b/src/runtime/streaming/operators/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod grouping; +pub mod joins; +pub mod key_by; +mod key_operator; +pub mod projection; +pub mod sink; +pub mod source; +mod stateless_physical_executor; +mod value_execution; +pub mod watermark; +pub mod windows; + +pub use key_operator::KeyExecutionOperator; +pub use projection::ProjectionOperator; +pub use stateless_physical_executor::StatelessPhysicalExecutor; +pub use value_execution::ValueExecutionOperator; + +pub use grouping::{Key, UpdatingCache}; diff --git a/src/runtime/streaming/operators/projection.rs b/src/runtime/streaming/operators/projection.rs new file mode 100644 index 00000000..1a2ff3a1 --- /dev/null +++ b/src/runtime/streaming/operators/projection.rs @@ -0,0 +1,135 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, anyhow}; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; + +use protocol::function_stream_graph::ProjectionOperator as ProjectionOperatorProto; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::global::Registry; +use crate::sql::common::{CheckpointBarrier, FsSchema, FsSchemaRef, Watermark}; +use crate::sql::logical_node::logical::OperatorName; + +pub struct ProjectionOperator { + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, +} + +impl ProjectionOperator { + pub fn new( + name: String, + output_schema: FsSchemaRef, + exprs: Vec>, + ) -> Self { + Self { + name, + output_schema, + exprs, + } + } + + pub fn from_proto(config: ProjectionOperatorProto, registry: Arc) -> Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing projection input_schema"))? + .try_into() + .map_err(|e| anyhow!("projection input_schema: {e}"))?; + + let output_schema: FsSchema = config + .output_schema + .ok_or_else(|| anyhow!("missing projection output_schema"))? 
+ .try_into() + .map_err(|e| anyhow!("projection output_schema: {e}"))?; + + let exprs = config + .exprs + .iter() + .map(|raw| { + let expr_node = PhysicalExprNode::decode(&mut raw.as_slice()) + .map_err(|e| anyhow!("decode projection expr: {e}"))?; + parse_physical_expr( + &expr_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse projection expr: {e}")) + }) + .collect::>>()?; + + let name = if config.name.is_empty() { + OperatorName::Projection.as_registry_key().to_string() + } else { + config.name + }; + + Ok(Self::new(name, Arc::new(output_schema), exprs)) + } +} + +#[async_trait] +impl Operator for ProjectionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let projected_columns = self + .exprs + .iter() + .map(|expr| { + expr.evaluate(&batch) + .and_then(|val| val.into_array(batch.num_rows())) + }) + .collect::>>()?; + + let out_batch = RecordBatch::try_new(self.output_schema.schema.clone(), projected_columns)?; + + Ok(vec![StreamOutput::Forward(out_batch)]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/sink/kafka/mod.rs b/src/runtime/streaming/operators/sink/kafka/mod.rs new file mode 100644 index 00000000..a24a098d --- /dev/null +++ b/src/runtime/streaming/operators/sink/kafka/mod.rs @@ -0,0 +1,369 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use anyhow::{Result, anyhow, bail};
use arrow_array::Array;
use arrow_array::RecordBatch;
use arrow_array::cast::AsArray;
use arrow_schema::{DataType, TimeUnit};
use async_trait::async_trait;
use rdkafka::ClientConfig;
use rdkafka::error::{KafkaError, RDKafkaErrorCode};
use rdkafka::producer::{DeliveryFuture, FutureProducer, FutureRecord, Producer};
use rdkafka::util::Timeout;
use std::collections::HashMap;
use std::time::Duration;
use tokio::time::sleep;
use tracing::{info, warn};

use crate::runtime::streaming::StreamOutput;
use crate::runtime::streaming::api::context::TaskContext;
use crate::runtime::streaming::api::operator::Operator;
use crate::runtime::streaming::format::DataSerializer;
use crate::sql::common::constants::factory_operator_name;
use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark};

// NOTE(review): generic type parameters in this file were lost in transit
// (angle-bracket contents stripped). The `Option<...>`, `Vec<...>`,
// `HashMap<...>` and turbofish arguments below are reconstructed from usage —
// confirm against the original commit.

// ============================================================================
// Delivery guarantees
// ============================================================================

/// How strictly produced records are tied to checkpoints.
#[derive(Debug, Clone)]
pub enum ConsistencyMode {
    /// Flush on checkpoint; duplicates possible after recovery.
    AtLeastOnce,
    /// Kafka transactions committed on checkpoint-commit; no duplicates.
    ExactlyOnce,
}

/// Exactly-once bookkeeping: the producer whose open transaction receives the
/// current epoch's writes, plus the previous epoch's producer whose
/// transaction is stashed until the checkpoint-commit signal arrives.
struct TransactionalState {
    // Monotonic counter used to derive unique `transactional.id` values.
    next_transaction_index: usize,
    active_producer: FutureProducer,
    // Set in `snapshot_state`, consumed in `commit_checkpoint`.
    producer_awaiting_commit: Option<FutureProducer>,
}

// ============================================================================
// Operator
// ============================================================================

pub struct KafkaSinkOperator {
    pub topic: String,
    pub bootstrap_servers: String,
    pub consistency_mode: ConsistencyMode,
    pub client_config: HashMap<String, String>,

    pub input_schema: FsSchema,
    // Resolved lazily from `input_schema` in `resolve_schema_indices`.
    pub timestamp_col_idx: Option<usize>,
    pub key_col_idx: Option<usize>,

    pub serializer: DataSerializer,

    at_least_once_producer: Option<FutureProducer>,
    transactional_state: Option<TransactionalState>,

    // In-flight delivery futures; awaited (drained) in `flush_to_broker`.
    write_futures: Vec<DeliveryFuture>,
}

impl KafkaSinkOperator {
    /// Construct an unstarted sink; producers are created in `on_start`.
    pub fn new(
        topic: String,
        bootstrap_servers: String,
        consistency_mode: ConsistencyMode,
        client_config: HashMap<String, String>,
        input_schema: FsSchema,
        serializer: DataSerializer,
    ) -> Self {
        Self {
            topic,
            bootstrap_servers,
            consistency_mode,
            client_config,
            input_schema,
            timestamp_col_idx: None,
            key_col_idx: None,
            serializer,
            at_least_once_producer: None,
            transactional_state: None,
            write_futures: Vec::new(),
        }
    }

    /// Derive the timestamp / key column indices from the input schema.
    /// Only the FIRST routing key (if any) is used as the Kafka message key.
    fn resolve_schema_indices(&mut self) {
        self.timestamp_col_idx = Some(self.input_schema.timestamp_index);

        if let Some(routing_keys) = self.input_schema.routing_keys()
            && !routing_keys.is_empty()
        {
            self.key_col_idx = Some(routing_keys[0]);
        }
    }

    /// Create a producer. When `tx_index` is `Some`, the producer is
    /// configured idempotent + transactional (with a per-job/topic/subtask/
    /// index `transactional.id`), its transactions are initialized, and a
    /// transaction is begun immediately — so an ExactlyOnce producer always
    /// has an open transaction.
    fn create_producer(
        &self,
        ctx: &TaskContext,
        tx_index: Option<usize>,
    ) -> Result<FutureProducer> {
        let mut config = ClientConfig::new();
        config.set("bootstrap.servers", &self.bootstrap_servers);

        // User-supplied settings may override defaults set above.
        for (k, v) in &self.client_config {
            config.set(k, v);
        }

        if let Some(idx) = tx_index {
            config.set("enable.idempotence", "true");
            let transactional_id = format!(
                "fs-tx-{}-{}-{}-{}",
                ctx.job_id, self.topic, ctx.subtask_index, idx
            );
            config.set("transactional.id", &transactional_id);

            let producer: FutureProducer = config.create()?;
            producer
                .init_transactions(Timeout::After(Duration::from_secs(30)))
                .map_err(|e| anyhow!("Failed to init Kafka transactions: {}", e))?;
            producer
                .begin_transaction()
                .map_err(|e| anyhow!("Failed to begin Kafka transaction: {}", e))?;

            Ok(producer)
        } else {
            Ok(config.create()?)
        }
    }

    /// Await every outstanding delivery future, failing on the first
    /// delivery error. Called from checkpoints and shutdown.
    async fn flush_to_broker(&mut self) -> Result<()> {
        let producer = self.current_producer();

        // Non-blocking poll to give librdkafka a chance to make progress.
        producer.poll(Timeout::After(Duration::ZERO));

        for future in self.write_futures.drain(..) {
            match future.await {
                Ok(Ok(_)) => continue,
                Ok(Err((e, _))) => bail!("Kafka producer delivery failed: {}", e),
                Err(_) => bail!("Kafka delivery future canceled"),
            }
        }
        Ok(())
    }

    /// The producer for the current mode.
    /// Panics (unwrap) if called before `on_start` initialized it.
    fn current_producer(&self) -> &FutureProducer {
        match &self.consistency_mode {
            ConsistencyMode::AtLeastOnce => self.at_least_once_producer.as_ref().unwrap(),
            ConsistencyMode::ExactlyOnce => {
                &self.transactional_state.as_ref().unwrap().active_producer
            }
        }
    }
}

/// Read the event timestamp for `row` from a timestamp-typed column,
/// normalized to milliseconds since epoch. `None` for nulls or
/// non-timestamp columns.
fn event_timestamp_ms(batch: &RecordBatch, row: usize, col: usize) -> Option<i64> {
    let arr = batch.column(col);
    match arr.data_type() {
        DataType::Timestamp(TimeUnit::Second, _) => {
            let a = arr.as_primitive::<arrow_array::types::TimestampSecondType>();
            (!a.is_null(row)).then(|| a.value(row) * 1000)
        }
        DataType::Timestamp(TimeUnit::Millisecond, _) => {
            let a = arr.as_primitive::<arrow_array::types::TimestampMillisecondType>();
            (!a.is_null(row)).then(|| a.value(row))
        }
        DataType::Timestamp(TimeUnit::Microsecond, _) => {
            let a = arr.as_primitive::<arrow_array::types::TimestampMicrosecondType>();
            (!a.is_null(row)).then(|| a.value(row) / 1000)
        }
        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
            let a = arr.as_primitive::<arrow_array::types::TimestampNanosecondType>();
            (!a.is_null(row)).then(|| a.value(row) / 1_000_000)
        }
        _ => None,
    }
}

/// Extract the Kafka message key bytes for `row` from a string column.
/// Only Utf8 / LargeUtf8 columns produce a key; anything else yields `None`.
fn row_key_bytes(batch: &RecordBatch, row: usize, col: usize) -> Option<Vec<u8>> {
    let arr = batch.column(col);
    match arr.data_type() {
        DataType::Utf8 => {
            let s = arr.as_string::<i32>();
            if s.is_null(row) {
                None
            } else {
                Some(s.value(row).as_bytes().to_vec())
            }
        }
        DataType::LargeUtf8 => {
            let s = arr.as_string::<i64>();
            if s.is_null(row) {
                None
            } else {
                Some(s.value(row).as_bytes().to_vec())
            }
        }
        _ => None,
    }
}

// ============================================================================
// Operator trait
// ============================================================================

#[async_trait]
impl Operator for KafkaSinkOperator {
    fn name(&self) -> &str {
        factory_operator_name::KAFKA_SINK
    }

    /// Resolve schema indices and create the mode-appropriate producer(s).
    async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> {
        self.resolve_schema_indices();

        match self.consistency_mode {
            ConsistencyMode::AtLeastOnce => {
                self.at_least_once_producer = Some(self.create_producer(ctx, None)?);
            }
            ConsistencyMode::ExactlyOnce => {
                let mut next_idx = 0usize;

                let active_producer = self.create_producer(ctx, Some(next_idx))?;
                next_idx += 1;

                self.transactional_state = Some(TransactionalState {
                    next_transaction_index: next_idx,
                    active_producer,
                    producer_awaiting_commit: None,
                });
            }
        }
        Ok(())
    }

    /// Serialize each row to its payload and enqueue it on the producer,
    /// applying backpressure (sleep + retry) when the local queue is full.
    /// Delivery futures are collected and only awaited at checkpoint/close.
    async fn process_data(
        &mut self,
        _input_idx: usize,
        batch: RecordBatch,
        _ctx: &mut TaskContext,
    ) -> Result<Vec<StreamOutput>> {
        let payloads = self.serializer.serialize(&batch)?;
        // FutureProducer is cheaply cloneable; clone to avoid borrowing self.
        let producer = self.current_producer().clone();

        for (i, payload) in payloads.iter().enumerate() {
            let ts_millis = self
                .timestamp_col_idx
                .and_then(|idx| event_timestamp_ms(&batch, i, idx));
            let key_bytes = self
                .key_col_idx
                .and_then(|idx| row_key_bytes(&batch, i, idx));

            let mut record = FutureRecord::<Vec<u8>, Vec<u8>>::to(&self.topic).payload(payload);
            if let Some(ts) = ts_millis {
                record = record.timestamp(ts);
            }
            if let Some(ref k) = key_bytes {
                record = record.key(k);
            }

            loop {
                match producer.send_result(record) {
                    Ok(delivery_future) => {
                        self.write_futures.push(delivery_future);
                        break;
                    }
                    // Local queue full: back off briefly and retry the same
                    // record (send_result hands it back on failure).
                    Err((
                        KafkaError::MessageProduction(RDKafkaErrorCode::QueueFull),
                        returned_record,
                    )) => {
                        record = returned_record;
                        sleep(Duration::from_millis(10)).await;
                    }
                    Err((e, _)) => bail!("Fatal Kafka send error: {}", e),
                }
            }
        }

        // Sinks are terminal: nothing to forward downstream.
        Ok(vec![])
    }

    /// Watermarks are consumed; a sink has no downstream.
    async fn process_watermark(
        &mut self,
        _watermark: Watermark,
        _ctx: &mut TaskContext,
    ) -> Result<Vec<StreamOutput>> {
        Ok(vec![])
    }

    /// Phase 1 of the two-phase commit: flush all pending deliveries; in
    /// ExactlyOnce mode, stash the active (still-uncommitted) producer for
    /// `commit_checkpoint` and swap in a fresh producer with a new open
    /// transaction for the next epoch.
    async fn snapshot_state(
        &mut self,
        _barrier: CheckpointBarrier,
        ctx: &mut TaskContext,
    ) -> Result<()> {
        self.flush_to_broker().await?;

        if matches!(self.consistency_mode, ConsistencyMode::ExactlyOnce) {
            let next_tx = self
                .transactional_state
                .as_ref()
                .map(|s| s.next_transaction_index)
                .unwrap();
            let new_producer = self.create_producer(ctx, Some(next_tx))?;

            let state = self.transactional_state.as_mut().unwrap();
            let old_producer = std::mem::replace(&mut state.active_producer, new_producer);
            state.producer_awaiting_commit = Some(old_producer);

            state.next_transaction_index += 1;
        }

        Ok(())
    }

    /// Phase 2: commit the stashed producer's transaction (ExactlyOnce only),
    /// retrying up to 5 times with a 2s pause between attempts.
    // NOTE(review): retrying commit_transaction after a failure assumes the
    // error is retriable; a fenced/aborted transaction may require different
    // handling — confirm against librdkafka semantics.
    async fn commit_checkpoint(&mut self, epoch: u32, _ctx: &mut TaskContext) -> Result<()> {
        if matches!(self.consistency_mode, ConsistencyMode::AtLeastOnce) {
            return Ok(());
        }

        let state = self.transactional_state.as_mut().unwrap();
        let Some(committing_producer) = state.producer_awaiting_commit.take() else {
            warn!(
                "Received Commit for epoch {}, but no stashed producer exists. Possibly a recovery duplicate.",
                epoch
            );
            return Ok(());
        };

        let mut retries = 0;
        loop {
            match committing_producer.commit_transaction(Timeout::After(Duration::from_secs(10))) {
                Ok(_) => {
                    info!(
                        "Successfully committed Kafka transaction for epoch {}",
                        epoch
                    );
                    break;
                }
                Err(e) => {
                    retries += 1;
                    if retries >= 5 {
                        bail!(
                            "Failed to commit Kafka transaction after 5 retries. Fatal error: {}",
                            e
                        );
                    }
                    warn!(
                        "Failed to commit Kafka transaction (Attempt {}/5): {}. Retrying...",
                        retries, e
                    );
                    sleep(Duration::from_secs(2)).await;
                }
            }
        }

        Ok(())
    }

    /// Graceful shutdown: drain outstanding deliveries before dropping the
    /// producer(s).
    async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result<Vec<StreamOutput>> {
        self.flush_to_broker().await?;
        info!("Kafka sink shut down gracefully.");
        Ok(vec![])
    }
}
+ +use anyhow::{Context as _, Result, anyhow}; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use bincode::{Decode, Encode}; +use governor::{DefaultDirectRateLimiter, Quota, RateLimiter as GovernorRateLimiter}; +use rdkafka::consumer::{CommitMode, Consumer, StreamConsumer}; +use rdkafka::{ClientConfig, Message as KMessage, Offset, TopicPartitionList}; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::time::{Duration, Instant}; +use tracing::{debug, error, info, warn}; + +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::source::{SourceEvent, SourceOffset, SourceOperator}; +use crate::runtime::streaming::format::{BadDataPolicy, DataDeserializer, Format}; +use crate::sql::common::fs_schema::FieldValueType; +use crate::sql::common::{CheckpointBarrier, MetadataField}; +// ============================================================================ +// ============================================================================ + +#[derive(Copy, Clone, Debug, Encode, Decode, PartialEq, PartialOrd)] +pub struct KafkaState { + partition: i32, + offset: i64, +} + +pub trait BatchDeserializer: Send + 'static { + fn deserialize_slice( + &mut self, + payload: &[u8], + timestamp: u64, + metadata: Option>>, + ) -> Result<()>; + + fn should_flush(&self) -> bool; + + fn flush_buffer(&mut self) -> Result>; + + fn is_empty(&self) -> bool; +} + +// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- + +pub struct BufferedDeserializer { + inner: DataDeserializer, + buffer: Vec>, + /// Parallel to `buffer`: Kafka message timestamp (ms) per row for filling `_timestamp`. 
+ kafka_timestamps_ms: Vec, + batch_size: usize, +} + +impl BufferedDeserializer { + pub fn new( + format: Format, + schema: SchemaRef, + bad_data_policy: BadDataPolicy, + batch_size: usize, + ) -> Self { + Self { + inner: DataDeserializer::new(format, schema, bad_data_policy), + buffer: Vec::with_capacity(batch_size), + kafka_timestamps_ms: Vec::with_capacity(batch_size), + batch_size, + } + } +} + +impl BatchDeserializer for BufferedDeserializer { + fn deserialize_slice( + &mut self, + payload: &[u8], + timestamp: u64, + _metadata: Option>>, + ) -> Result<()> { + self.buffer.push(payload.to_vec()); + self.kafka_timestamps_ms.push(timestamp); + Ok(()) + } + + fn should_flush(&self) -> bool { + self.buffer.len() >= self.batch_size + } + + fn flush_buffer(&mut self) -> Result> { + if self.buffer.is_empty() { + return Ok(None); + } + + let refs: Vec<&[u8]> = self.buffer.iter().map(|v| v.as_slice()).collect(); + let batch = self + .inner + .deserialize_batch_with_kafka_timestamps(&refs, &self.kafka_timestamps_ms)?; + self.buffer.clear(); + self.kafka_timestamps_ms.clear(); + Ok(Some(batch)) + } + + fn is_empty(&self) -> bool { + self.buffer.is_empty() + } +} + +impl SourceOffset { + fn rdkafka_offset(self) -> Offset { + match self { + SourceOffset::Earliest => Offset::Beginning, + SourceOffset::Latest => Offset::End, + SourceOffset::Group => Offset::Stored, + } + } +} + +// ============================================================================ +// ============================================================================ + +const KAFKA_POLL_TIMEOUT: Duration = Duration::from_millis(100); +const MAX_BATCH_LINGER_TIME: Duration = Duration::from_millis(500); + +pub struct KafkaSourceOperator { + pub topic: String, + pub bootstrap_servers: String, + pub group_id: Option, + pub group_id_prefix: Option, + pub offset_mode: SourceOffset, + + pub client_configs: HashMap, + pub messages_per_second: NonZeroU32, + pub metadata_fields: Vec, + + consumer: Option, + 
rate_limiter: Option, + deserializer: Box, + + current_offsets: HashMap, + is_empty_assignment: bool, + + last_flush_time: Instant, +} + +impl KafkaSourceOperator { + #[allow(clippy::too_many_arguments)] + pub fn new( + topic: String, + bootstrap_servers: String, + group_id: Option, + group_id_prefix: Option, + offset_mode: SourceOffset, + client_configs: HashMap, + messages_per_second: NonZeroU32, + metadata_fields: Vec, + deserializer: Box, + ) -> Self { + Self { + topic, + bootstrap_servers, + group_id, + group_id_prefix, + offset_mode, + client_configs, + messages_per_second, + metadata_fields, + consumer: None, + rate_limiter: None, + deserializer, + current_offsets: HashMap::new(), + is_empty_assignment: false, + last_flush_time: Instant::now(), + } + } + + async fn init_and_assign_consumer(&mut self, ctx: &mut TaskContext) -> Result<()> { + info!("Creating kafka consumer for {}", self.bootstrap_servers); + let mut client_config = ClientConfig::new(); + + let group_id = match (&self.group_id, &self.group_id_prefix) { + (Some(gid), _) => gid.clone(), + (None, Some(prefix)) => { + format!("{}-fs-{}-{}", prefix, ctx.job_id, ctx.subtask_index) + } + (None, None) => format!("fs-{}-{}-consumer", ctx.job_id, ctx.subtask_index), + }; + + for (key, value) in &self.client_configs { + client_config.set(key, value); + } + + let consumer: StreamConsumer = client_config + .set("bootstrap.servers", &self.bootstrap_servers) + .set("enable.partition.eof", "false") + .set("enable.auto.commit", "false") + .set("group.id", &group_id) + .create()?; + + let has_state = false; + let state_map: HashMap = HashMap::new(); + + let metadata = consumer + .fetch_metadata(Some(&self.topic), Duration::from_secs(30)) + .context("Failed to fetch Kafka metadata")?; + + let topic_meta = metadata + .topics() + .iter() + .find(|t| t.name() == self.topic) + .ok_or_else(|| anyhow!("topic {} not in metadata", self.topic))?; + + let partitions = topic_meta.partitions(); + let mut our_partitions = 
HashMap::new(); + let pmax = ctx.parallelism.max(1) as i32; + + for p in partitions { + if p.id().rem_euclid(pmax) == ctx.subtask_index as i32 { + let offset = state_map + .get(&p.id()) + .map(|s| Offset::Offset(s.offset)) + .unwrap_or_else(|| { + if has_state { + Offset::Beginning + } else { + self.offset_mode.rdkafka_offset() + } + }); + our_partitions.insert((self.topic.clone(), p.id()), offset); + } + } + + if our_partitions.is_empty() { + warn!( + "[Task {}] Subscribed to no partitions. Entering idle mode.", + ctx.subtask_index + ); + self.is_empty_assignment = true; + } else { + let topic_partitions = TopicPartitionList::from_topic_map(&our_partitions)?; + consumer.assign(&topic_partitions)?; + } + + self.consumer = Some(consumer); + Ok(()) + } +} + +// ============================================================================ +// ============================================================================ + +#[async_trait] +impl SourceOperator for KafkaSourceOperator { + fn name(&self) -> &str { + &self.topic + } + + async fn on_start(&mut self, ctx: &mut TaskContext) -> Result<()> { + self.init_and_assign_consumer(ctx).await?; + self.rate_limiter = Some(GovernorRateLimiter::direct(Quota::per_second( + self.messages_per_second, + ))); + Ok(()) + } + + async fn fetch_next(&mut self, _ctx: &mut TaskContext) -> Result { + if self.is_empty_assignment { + return Ok(SourceEvent::Idle); + } + + let consumer = self + .consumer + .as_ref() + .ok_or_else(|| anyhow!("Kafka consumer not initialized"))?; + let rate_limiter = self + .rate_limiter + .as_ref() + .ok_or_else(|| anyhow!("rate limiter not initialized"))?; + + match tokio::time::timeout(KAFKA_POLL_TIMEOUT, consumer.recv()).await { + Ok(Ok(msg)) => { + let partition = msg.partition(); + let offset = msg.offset(); + let timestamp = msg.timestamp().to_millis().ok_or_else(|| { + anyhow!("Failed to read timestamp from Kafka record: message has no timestamp") + })?; + + self.current_offsets.insert(partition, 
offset); + + if let Some(payload) = msg.payload() { + let topic = msg.topic(); + + let connector_metadata = if !self.metadata_fields.is_empty() { + let mut meta = HashMap::new(); + for f in &self.metadata_fields { + meta.insert( + f.field_name.as_str(), + match f.key.as_str() { + "key" => FieldValueType::Bytes(msg.key()), + "offset_id" => FieldValueType::Int64(Some(msg.offset())), + "partition" => FieldValueType::Int32(Some(msg.partition())), + "topic" => FieldValueType::String(Some(topic)), + "timestamp" => FieldValueType::Int64(Some(timestamp)), + _ => continue, + }, + ); + } + Some(meta) + } else { + None + }; + + self.deserializer.deserialize_slice( + payload, + timestamp.max(0) as u64, + connector_metadata, + )?; + } else { + debug!( + "Received tombstone message at partition {} offset {}", + partition, offset + ); + } + + rate_limiter.until_ready().await; + + let should_flush_by_size = self.deserializer.should_flush(); + let should_flush_by_time = self.last_flush_time.elapsed() > MAX_BATCH_LINGER_TIME; + + if !self.deserializer.is_empty() + && (should_flush_by_size || should_flush_by_time) + && let Some(batch) = self.deserializer.flush_buffer()? + { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + + Ok(SourceEvent::Idle) + } + Ok(Err(e)) => { + error!("Kafka recv error: {}", e); + Err(anyhow!("Kafka error: {}", e)) + } + Err(_) => { + if !self.deserializer.is_empty() + && let Some(batch) = self.deserializer.flush_buffer()? 
+ { + self.last_flush_time = Instant::now(); + return Ok(SourceEvent::Data(batch)); + } + Ok(SourceEvent::Idle) + } + } + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + ctx: &mut TaskContext, + ) -> Result<()> { + debug!("Source [{}] executing checkpoint", ctx.subtask_index); + + let mut topic_partitions = TopicPartitionList::new(); + for (&partition, &offset) in &self.current_offsets { + topic_partitions + .add_partition_offset(&self.topic, partition, Offset::Offset(offset)) + .map_err(|e| anyhow!("add_partition_offset: {e}"))?; + } + + if let Some(consumer) = &self.consumer + && let Err(e) = consumer.commit(&topic_partitions, CommitMode::Async) + { + warn!("Failed to commit async offset to Kafka Broker: {:?}", e); + } + + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result<()> { + info!("Kafka source shutting down gracefully"); + self.consumer.take(); + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/source/mod.rs b/src/runtime/streaming/operators/source/mod.rs new file mode 100644 index 00000000..b9574391 --- /dev/null +++ b/src/runtime/streaming/operators/source/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
pub mod kafka;

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::{Arc, RwLock};

use anyhow::{Result, anyhow};
use arrow_array::RecordBatch;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::execution::TaskContext;
use datafusion::execution::context::SessionContext;
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_proto::physical_plan::AsExecutionPlan;
use datafusion_proto::protobuf::PhysicalPlanNode;
use futures::StreamExt;
use prost::Message;

use crate::runtime::streaming::factory::Registry;
use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec};

// NOTE(review): generic type parameters here were lost in transit; the
// `Arc<RwLock<Option<RecordBatch>>>` / `Arc<dyn ExecutionPlan>` /
// `Arc<TaskContext>` types below are reconstructed — confirm against the
// original commit.

/// Executes a stateless DataFusion physical plan one batch at a time.
///
/// The decoded plan's leaf is wired (via
/// `StreamingDecodingContext::SingleLockedBatch`) to read from the shared
/// `batch` slot, which is overwritten on every `process_batch` call.
pub struct StatelessPhysicalExecutor {
    // Shared slot the plan's scan node reads from.
    batch: Arc<RwLock<Option<RecordBatch>>>,
    plan: Arc<dyn ExecutionPlan>,
    task_context: Arc<TaskContext>,
}

impl StatelessPhysicalExecutor {
    /// Decode a protobuf-encoded `PhysicalPlanNode` and bind its source to
    /// the internal single-batch slot.
    pub fn new(mut proto: &[u8], registry: &Registry) -> Result<Self> {
        let batch = Arc::new(RwLock::default());

        let plan_node = PhysicalPlanNode::decode(&mut proto)
            .map_err(|e| anyhow!("decode PhysicalPlanNode: {e}"))?;
        let codec = StreamingExtensionCodec {
            context: StreamingDecodingContext::SingleLockedBatch(batch.clone()),
        };

        let plan = plan_node.try_into_physical_plan(
            registry,
            &RuntimeEnvBuilder::new().build()?,
            &codec,
        )?;

        Ok(Self {
            batch,
            plan,
            task_context: SessionContext::new().task_ctx(),
        })
    }

    /// Run the plan over `batch`, returning its (possibly multi-batch)
    /// output stream.
    pub async fn process_batch(&mut self, batch: RecordBatch) -> Result<SendableRecordBatchStream> {
        {
            // Scoped so the write guard drops before execution begins.
            let mut writer = self
                .batch
                .write()
                .map_err(|e| anyhow!("SingleLockedBatch lock: {e}"))?;
            *writer = Some(batch);
        }
        // NOTE(review): `reset` appears to be a project extension to
        // `ExecutionPlan` (not upstream DataFusion) enabling re-execution of
        // the same plan per batch — confirm.
        self.plan
            .reset()
            .map_err(|e| anyhow!("reset execution plan: {e}"))?;
        self.plan
            .execute(0, self.task_context.clone())
            .map_err(|e| anyhow!("failed to compute plan: {e}"))
    }

    /// Like `process_batch`, but enforces that the plan yields exactly one
    /// output batch; errors on zero or more than one.
    pub async fn process_single(&mut self, batch: RecordBatch) -> Result<RecordBatch> {
        let mut stream = self.process_batch(batch).await?;
        let result = stream
            .next()
            .await
            .ok_or_else(|| anyhow!("empty output stream"))??;
        anyhow::ensure!(
            stream.next().await.is_none(),
            "expected exactly one output batch"
        );
        Ok(result)
    }
}

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use anyhow::Result; +use arrow_array::RecordBatch; +use async_trait::async_trait; +use futures::StreamExt; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::operators::StatelessPhysicalExecutor; +use crate::sql::common::{CheckpointBarrier, Watermark}; + +pub struct ValueExecutionOperator { + name: String, + executor: StatelessPhysicalExecutor, +} + +impl ValueExecutionOperator { + pub fn new(name: String, executor: StatelessPhysicalExecutor) -> Self { + Self { name, executor } + } +} + +#[async_trait] +impl Operator for ValueExecutionOperator { + fn name(&self) -> &str { + &self.name + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + _ctx: &mut TaskContext, + ) -> Result> { + let mut outputs = Vec::new(); + + let mut stream = self.executor.process_batch(batch).await?; + + while let Some(batch_result) = stream.next().await { + let out_batch = batch_result?; + if out_batch.num_rows() > 0 { + outputs.push(StreamOutput::Forward(out_batch)); + } + } + Ok(outputs) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![StreamOutput::Watermark(watermark)]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime/streaming/operators/watermark/mod.rs b/src/runtime/streaming/operators/watermark/mod.rs new file mode 100644 index 00000000..3a0a1099 --- /dev/null +++ b/src/runtime/streaming/operators/watermark/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Watermark generation operators.
pub mod watermark_generator;

pub use watermark_generator::WatermarkGeneratorConstructor;

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +use anyhow::{Result, anyhow}; +use arrow::compute::kernels::aggregate; +use arrow_array::cast::AsArray; +use arrow_array::types::TimestampNanosecondType; +use arrow_array::{RecordBatch, TimestampNanosecondArray}; +use bincode::{Decode, Encode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::from_proto::parse_physical_expr; +use datafusion_proto::protobuf::PhysicalExprNode; +use prost::Message; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; +use tracing::{debug, info}; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark, from_nanos, to_millis}; +use async_trait::async_trait; +use protocol::function_stream_graph::ExpressionWatermarkConfig; + +#[derive(Debug, Copy, Clone, Encode, Decode, PartialEq, Eq)] +pub struct WatermarkGeneratorState { + pub last_watermark_emitted_at: SystemTime, + pub max_watermark: SystemTime, +} + +impl Default for WatermarkGeneratorState { + fn default() -> Self { + Self { + last_watermark_emitted_at: SystemTime::UNIX_EPOCH, + max_watermark: SystemTime::UNIX_EPOCH, + } + } +} + +pub struct WatermarkGeneratorOperator { + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + state: WatermarkGeneratorState, + last_event_wall: SystemTime, + is_idle: bool, +} + +impl WatermarkGeneratorOperator { + pub fn new( + interval: Duration, + idle_time: Option, + expression: Arc, + timestamp_index: usize, + ) -> Self { + Self { + interval, + idle_time, + expression, + timestamp_index, + state: WatermarkGeneratorState::default(), + last_event_wall: SystemTime::now(), + is_idle: false, + } + } + + fn extract_max_timestamp(&self, batch: &RecordBatch) -> Option { + let ts_column = 
batch.column(self.timestamp_index); + let arr = ts_column.as_primitive::(); + let max_ts = aggregate::max(arr)?; + Some(from_nanos(max_ts as u128)) + } + + fn evaluate_watermark(&self, batch: &RecordBatch) -> Result { + let watermark_array = self + .expression + .evaluate(batch)? + .into_array(batch.num_rows())?; + + let typed_array = watermark_array + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("watermark expression must return TimestampNanosecondArray"))?; + + let max_watermark_nanos = aggregate::max(typed_array) + .ok_or_else(|| anyhow!("failed to extract max watermark from batch"))?; + + Ok(from_nanos(max_watermark_nanos as u128)) + } +} + +#[async_trait] +impl Operator for WatermarkGeneratorOperator { + fn name(&self) -> &str { + "ExpressionWatermarkGenerator" + } + + fn tick_interval(&self) -> Option { + Some(Duration::from_secs(1)) + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + self.last_event_wall = SystemTime::now(); + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + self.last_event_wall = SystemTime::now(); + + let mut outputs = vec![StreamOutput::Forward(batch.clone())]; + + let Some(max_batch_ts) = self.extract_max_timestamp(&batch) else { + return Ok(outputs); + }; + + let new_watermark = self.evaluate_watermark(&batch)?; + + self.state.max_watermark = self.state.max_watermark.max(new_watermark); + + let time_since_last_emit = max_batch_ts + .duration_since(self.state.last_watermark_emitted_at) + .unwrap_or(Duration::ZERO); + + if self.is_idle || time_since_last_emit > self.interval { + debug!( + "[{}] emitting expression watermark {}", + ctx.subtask_index, + to_millis(self.state.max_watermark) + ); + + outputs.push(StreamOutput::Watermark(Watermark::EventTime( + self.state.max_watermark, + ))); + + self.state.last_watermark_emitted_at = max_batch_ts; + self.is_idle = false; + } + + Ok(outputs) + } + + async fn 
process_watermark( + &mut self, + _watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + Ok(vec![]) + } + + async fn process_tick( + &mut self, + _tick_index: u64, + ctx: &mut TaskContext, + ) -> Result> { + if let Some(idle_timeout) = self.idle_time { + let elapsed = self.last_event_wall.elapsed().unwrap_or(Duration::ZERO); + if !self.is_idle && elapsed > idle_timeout { + info!( + "task [{}] entering Idle after {:?}", + ctx.subtask_index, idle_timeout + ); + self.is_idle = true; + return Ok(vec![StreamOutput::Watermark(Watermark::Idle)]); + } + } + Ok(vec![]) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![StreamOutput::Watermark(Watermark::EventTime( + from_nanos(u64::MAX as u128), + ))]) + } +} + +pub struct WatermarkGeneratorConstructor; + +impl WatermarkGeneratorConstructor { + pub fn with_config( + &self, + config: ExpressionWatermarkConfig, + registry: Arc, + ) -> anyhow::Result { + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into() + .map_err(|e| anyhow!("input schema: {e}"))?; + let timestamp_index = input_schema.timestamp_index; + + let expression_node = PhysicalExprNode::decode(&mut config.expression.as_slice()) + .map_err(|e| anyhow!("decode expression: {e}"))?; + let expression = parse_physical_expr( + &expression_node, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + ) + .map_err(|e| anyhow!("parse physical expr: {e}"))?; + + let interval = Duration::from_micros(config.period_micros); + let idle_time = config.idle_time_micros.map(Duration::from_micros); + + Ok(WatermarkGeneratorOperator::new( + interval, + idle_time, + expression, + timestamp_index, + )) + } +} diff --git a/src/runtime/streaming/operators/windows/mod.rs b/src/runtime/streaming/operators/windows/mod.rs new file mode 100644 index 00000000..f1915f0d --- /dev/null +++ b/src/runtime/streaming/operators/windows/mod.rs @@ -0,0 +1,21 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod session_aggregating_window; +pub mod sliding_aggregating_window; +pub mod tumbling_aggregating_window; +pub mod window_function; + +pub use session_aggregating_window::SessionAggregatingWindowConstructor; +pub use sliding_aggregating_window::SlidingAggregatingWindowConstructor; +pub use tumbling_aggregating_window::TumblingAggregateWindowConstructor; +pub use window_function::WindowFunctionConstructor; diff --git a/src/runtime/streaming/operators/windows/session_aggregating_window.rs b/src/runtime/streaming/operators/windows/session_aggregating_window.rs new file mode 100644 index 00000000..4293ea7c --- /dev/null +++ b/src/runtime/streaming/operators/windows/session_aggregating_window.rs @@ -0,0 +1,802 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
use anyhow::{Context, Result, anyhow, bail};
use arrow::compute::{
    concat_batches, filter_record_batch, kernels::cmp::gt_eq, lexsort_to_indices, partition, take,
};
use arrow::row::{RowConverter, SortField};
use arrow_array::types::TimestampNanosecondType;
use arrow_array::{
    Array, ArrayRef, BooleanArray, PrimitiveArray, RecordBatch, StructArray,
    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
    TimestampSecondArray,
};
use arrow_schema::{DataType, Field, FieldRef, Schema, TimeUnit};
use datafusion::execution::SendableRecordBatchStream;
use datafusion::execution::context::SessionContext;
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::physical_plan::ExecutionPlan;
use datafusion_proto::physical_plan::AsExecutionPlan;
use datafusion_proto::protobuf::PhysicalPlanNode;
use futures::StreamExt;
use prost::Message;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::{Arc, RwLock};
use std::time::{Duration, SystemTime};
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel};

use crate::runtime::streaming::StreamOutput;
use crate::runtime::streaming::api::context::TaskContext;
use crate::runtime::streaming::api::operator::Operator;
use crate::runtime::streaming::factory::Registry;
use crate::sql::common::converter::Converter;
use crate::sql::common::{
    CheckpointBarrier, FsSchema, FsSchemaRef, Watermark, from_nanos, to_nanos,
};
use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec};
use crate::sql::schema::utils::window_arrow_struct;
use async_trait::async_trait;
use protocol::function_stream_graph::SessionWindowAggregateOperator;
// ============================================================================
// ============================================================================

/// Immutable, shared configuration for every per-key session state.
struct SessionWindowConfig {
    /// Inactivity gap that closes a session.
    gap: Duration,
    input_schema_ref: FsSchemaRef,
    /// Struct field (start/end) inserted into the output schema.
    window_field: FieldRef,
    window_index: usize,
    /// Aggregation plan re-executed once per session.
    final_physical_exec: Arc<dyn ExecutionPlan>,
    // Hand-off slot: the receiver half of a fresh channel is parked here so
    // the decoded plan's source node can pick it up on `execute`.
    receiver_hook: Arc<RwLock<Option<UnboundedReceiver<RecordBatch>>>>,
    output_schema: Arc<Schema>,
}

/// One in-flight session: batches are streamed into `sender`; the aggregate
/// result is read back from `result_stream` when the session closes.
struct ActiveSession {
    // Event-time bounds of the data ingested so far (end is exclusive of gap).
    data_start: SystemTime,
    data_end: SystemTime,
    // `None` once the session has been closed for draining.
    sender: Option<UnboundedSender<RecordBatch>>,
    result_stream: SendableRecordBatchStream,
}

impl ActiveSession {
    /// Resets and starts the shared aggregation plan for a new session.
    async fn new(
        aggregation_plan: Arc<dyn ExecutionPlan>,
        initial_timestamp: SystemTime,
        sender: UnboundedSender<RecordBatch>,
    ) -> Result<Self> {
        aggregation_plan.reset()?;
        let result_exec = aggregation_plan.execute(0, SessionContext::new().task_ctx())?;
        Ok(Self {
            data_start: initial_timestamp,
            data_end: initial_timestamp,
            sender: Some(sender),
            result_stream: result_exec,
        })
    }

    /// Feeds a time-sorted batch into the session. Returns `None` when the
    /// whole batch belongs to this session, or `Some((start, rest))` with the
    /// suffix that starts a new session (its first timestamp lies beyond
    /// `data_end + gap`).
    fn ingest_batch(
        &mut self,
        batch: RecordBatch,
        gap: Duration,
        ts_idx: usize,
    ) -> Result<Option<(SystemTime, RecordBatch)>> {
        let ts_col = batch
            .column(ts_idx)
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .ok_or_else(|| anyhow!("expected timestamp column"))?;
        let start_ts = ts_col.value(0);
        let end_ts = ts_col.value(batch.num_rows() - 1);

        let current_end_with_gap = to_nanos(self.data_end + gap) as i64;

        // Case 1: the entire batch falls inside the current session window.
        if end_ts < current_end_with_gap {
            self.data_end = self.data_end.max(from_nanos(end_ts as u128));
            self.data_start = self.data_start.min(from_nanos(start_ts as u128));
            self.sender
                .as_ref()
                .ok_or_else(|| anyhow!("session sender already closed"))?
                .send(batch)
                .map_err(|e| anyhow!("session channel send: {e}"))?;
            return Ok(None);
        }

        // Case 2: the entire batch lies beyond the gap — new session.
        if current_end_with_gap < start_ts {
            return Ok(Some((from_nanos(start_ts as u128), batch)));
        }

        self.data_start = self.data_start.min(from_nanos(start_ts as u128));

        // Case 3: the batch straddles the gap boundary — walk forward,
        // extending `data_end` while each row stays within gap of it.
        let mut split_idx = 1;
        while split_idx < batch.num_rows() {
            let val = ts_col.value(split_idx);
            if val < to_nanos(self.data_end) as i64 {
                split_idx += 1;
                continue;
            }
            if val < to_nanos(self.data_end + gap) as i64 {
                self.data_end = from_nanos(val as u128);
                split_idx += 1;
                continue;
            }
            break;
        }

        if split_idx == batch.num_rows() {
            self.sender
                .as_ref()
                .ok_or_else(|| anyhow!("session sender already closed"))?
                .send(batch)
                .map_err(|e| anyhow!("session channel send: {e}"))?;
            return Ok(None);
        }

        // Prefix extends this session; suffix seeds the next one.
        self.sender
            .as_ref()
            .ok_or_else(|| anyhow!("session sender already closed"))?
            .send(batch.slice(0, split_idx))
            .map_err(|e| anyhow!("session channel send: {e}"))?;
        let remaining_batch = batch.slice(split_idx, batch.num_rows() - split_idx);
        let new_start_time = from_nanos(ts_col.value(split_idx) as u128);
        Ok(Some((new_start_time, remaining_batch)))
    }

    /// Closes the input channel and drains the aggregation result.
    /// Invariant: the plan yields exactly one batch with exactly one row.
    async fn close_and_drain(mut self, gap: Duration) -> Result<SessionWindowResult> {
        // Dropping the sender lets the plan's source see end-of-stream.
        self.sender.take();

        let mut result_batches = Vec::new();
        while let Some(batch) = self.result_stream.next().await {
            result_batches.push(batch?);
        }

        if result_batches.len() != 1 || result_batches[0].num_rows() != 1 {
            bail!("active session must yield exactly one aggregate row");
        }

        Ok(SessionWindowResult {
            window_start: self.data_start,
            window_end: self.data_end + gap,
            batch: result_batches.into_iter().next().unwrap(),
        })
    }
}

/// A closed session: its window bounds and the single-row aggregate batch.
struct SessionWindowResult {
    window_start: SystemTime,
    window_end: SystemTime,
    batch: RecordBatch,
}

/// All session state for one routing key: at most one active session plus
/// batches buffered (by start time) that are not yet safe to aggregate.
struct KeySessionState {
    config: Arc<SessionWindowConfig>,
    active_session: Option<ActiveSession>,
    buffered_batches: BTreeMap<SystemTime, Vec<RecordBatch>>,
}

impl KeySessionState {
    fn new(config: Arc<SessionWindowConfig>) -> Self {
        Self {
            config,
            active_session: None,
            buffered_batches: BTreeMap::new(),
        }
    }

    fn is_empty(&self) -> bool {
        self.active_session.is_none() && self.buffered_batches.is_empty()
    }

    /// Earliest event time held by this key (active session first, else the
    /// earliest buffered batch).
    fn earliest_data_time(&self) -> Option<SystemTime> {
        self.active_session
            .as_ref()
            .map(|s| s.data_start)
            .or_else(|| self.buffered_batches.keys().next().copied())
    }

    /// The watermark at which something will happen for this key: either the
    /// active session can close (`data_end + gap`), or a buffered batch can
    /// start a session (`start - gap`).
    fn next_watermark_action_time(&self) -> Option<SystemTime> {
        self.active_session
            .as_ref()
            .map(|s| s.data_end + self.config.gap)
            .or_else(|| {
                self.buffered_batches
                    .keys()
                    .next()
                    .map(|t| *t - self.config.gap)
            })
    }

    /// Advances this key's state machine up to `watermark`, closing expired
    /// sessions and promoting buffered batches into new sessions. Returns all
    /// sessions that closed.
    async fn advance_by_watermark(
        &mut self,
        watermark: SystemTime,
    ) -> Result<Vec<SessionWindowResult>> {
        let mut results = vec![];

        loop {
            if let Some(session) = &mut self.active_session {
                // Close the session once the watermark has passed its gap.
                if session.data_end + self.config.gap < watermark {
                    let closed_session = self
                        .active_session
                        .take()
                        .unwrap()
                        .close_and_drain(self.config.gap)
                        .await?;
                    results.push(closed_session);
                } else {
                    break;
                }
            } else {
                // No active session: try to promote the earliest buffered
                // batch, but only once the watermark is within gap of it
                // (earlier data can no longer arrive and extend backwards).
                let Some((initial_ts, _)) = self.buffered_batches.first_key_value() else {
                    break;
                };
                if watermark + self.config.gap < *initial_ts {
                    break;
                }

                // Park a fresh receiver for the plan's source node to claim.
                let (tx, rx) = unbounded_channel();
                *self.config.receiver_hook.write().unwrap() = Some(rx);

                self.active_session = Some(
                    ActiveSession::new(self.config.final_physical_exec.clone(), *initial_ts, tx)
                        .await?,
                );

                self.drain_buffer_to_active_session()?;
            }
        }
        Ok(results)
    }

    /// Moves every buffered batch that falls within gap of the active session
    /// into it; suffixes that overshoot the gap are re-buffered.
    fn drain_buffer_to_active_session(&mut self) -> Result<()> {
        let session = self
            .active_session
            .as_mut()
            .ok_or_else(|| anyhow!("drain_buffer_to_active_session without active session"))?;

        while let Some((first_key, _)) = self.buffered_batches.first_key_value() {
            if session.data_end + self.config.gap < *first_key {
                break;
            }

            let (_, batches) = self.buffered_batches.pop_first().unwrap();
            for batch in batches {
                if let Some((rem_start, rem_batch)) = session.ingest_batch(
                    batch,
                    self.config.gap,
                    self.config.input_schema_ref.timestamp_index,
                )? {
                    self.buffered_batches
                        .entry(rem_start)
                        .or_default()
                        .push(rem_batch);
                }
            }
        }
        Ok(())
    }

    /// Buffers a batch, drains it into the active session where possible,
    /// and asserts that ingestion alone never flushes a completed session.
    async fn add_data(
        &mut self,
        start_time: SystemTime,
        batch: RecordBatch,
        watermark: Option<SystemTime>,
    ) -> Result<()> {
        self.buffered_batches
            .entry(start_time)
            .or_default()
            .push(batch);

        if self.active_session.is_some() {
            self.drain_buffer_to_active_session()?;
        }

        if let Some(wm) = watermark {
            let flushed = self.advance_by_watermark(wm).await?;
            if !flushed.is_empty() {
                // Data is pre-filtered against the watermark, so advancing
                // here must never complete a session.
                bail!(
                    "unexpected flush during data ingestion; session watermark invariant violated"
                );
            }
        }
        Ok(())
    }
}

/// First (smallest) event time of a batch already sorted by timestamp.
fn start_time_for_sorted_batch(batch: &RecordBatch, schema: &FsSchema) -> SystemTime {
    let timestamp_array = batch.column(schema.timestamp_index);
    let timestamp_array = timestamp_array
        .as_any()
        .downcast_ref::<PrimitiveArray<TimestampNanosecondType>>()
        .expect("timestamp column");
    from_nanos(timestamp_array.value(0) as u128)
}

/// Appends the stream `_timestamp` column (see [`build_session_output_schema`]) using each
/// session's `window_end` as the row event time.
+fn append_output_timestamp_column( + columns: &mut Vec, + session_results: &[SessionWindowResult], + ts_field: &Field, +) -> Result<()> { + let nanos = |r: &SessionWindowResult| to_nanos(r.window_end) as i64 - 1; + match ts_field.data_type() { + DataType::Timestamp(TimeUnit::Second, tz) => { + let v: Vec = session_results + .iter() + .map(|r| nanos(r) / 1_000_000_000) + .collect(); + columns.push(Arc::new( + TimestampSecondArray::from(v).with_timezone_opt(tz.clone()), + )); + } + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + let v: Vec = session_results + .iter() + .map(|r| nanos(r) / 1_000_000) + .collect(); + columns.push(Arc::new( + TimestampMillisecondArray::from(v).with_timezone_opt(tz.clone()), + )); + } + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + let v: Vec = session_results.iter().map(|r| nanos(r) / 1000).collect(); + columns.push(Arc::new( + TimestampMicrosecondArray::from(v).with_timezone_opt(tz.clone()), + )); + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + let v: Vec = session_results.iter().map(nanos).collect(); + columns.push(Arc::new( + TimestampNanosecondArray::from(v).with_timezone_opt(tz.clone()), + )); + } + dt => bail!("unsupported timestamp type for session window output: {dt}"), + } + Ok(()) +} + +fn build_session_output_schema( + input: &FsSchema, + window_field: FieldRef, + window_index: usize, + agg_schema: &Schema, +) -> Result> { + let key_count = input.routing_keys().map(|k| k.len()).unwrap_or(0); + let mut fields: Vec = (0..key_count) + .map(|i| input.schema.fields()[i].clone()) + .collect(); + fields.insert(window_index, window_field); + fields.extend(agg_schema.fields().iter().cloned()); + fields.push(input.schema.fields()[input.timestamp_index].clone()); + Ok(Arc::new(Schema::new(fields))) +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionWindowOperator { + config: 
Arc, + row_converter: Converter, + + session_states: HashMap, KeySessionState>, + pq_watermark_actions: BTreeMap>>, + pq_start_times: BTreeMap>>, +} + +impl SessionWindowOperator { + fn filter_batch_by_time( + &self, + batch: RecordBatch, + watermark: Option, + ) -> Result { + let Some(watermark) = watermark else { + return Ok(batch); + }; + + let timestamp_column = batch + .column(self.config.input_schema_ref.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| anyhow!("expected timestamp column"))?; + + let watermark_scalar = TimestampNanosecondArray::new_scalar(to_nanos(watermark) as i64); + let on_time = gt_eq(timestamp_column, &watermark_scalar)?; + + Ok(filter_record_batch(&batch, &on_time)?) + } + + fn sort_batch(&self, batch: &RecordBatch) -> Result { + let sort_columns = self.config.input_schema_ref.sort_columns(batch, true); + let sort_indices = lexsort_to_indices(&sort_columns, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + Ok(RecordBatch::try_new(batch.schema(), columns)?) 
+ } + + async fn ingest_sorted_batch( + &mut self, + sorted_batch: RecordBatch, + watermark: Option, + ) -> Result<()> { + let partition_ranges = if !self.config.input_schema_ref.has_routing_keys() { + std::iter::once(0..sorted_batch.num_rows()).collect::>() + } else { + let key_len = self + .config + .input_schema_ref + .routing_keys() + .as_ref() + .unwrap() + .len(); + let key_cols = sorted_batch + .columns() + .iter() + .take(key_len) + .cloned() + .collect::>(); + partition(key_cols.as_slice())?.ranges() + }; + + let key_count = self + .config + .input_schema_ref + .routing_keys() + .map(|k| k.len()) + .unwrap_or(0); + + for range in partition_ranges { + let key_batch = sorted_batch.slice(range.start, range.end - range.start); + + let row_key = if key_count == 0 { + Vec::new() + } else { + self.row_converter + .convert_columns(&key_batch.slice(0, 1).columns()[0..key_count]) + .context("row key convert")? + .as_ref() + .to_vec() + }; + + let state = self + .session_states + .entry(row_key.clone()) + .or_insert_with(|| KeySessionState::new(self.config.clone())); + + let initial_action = state.next_watermark_action_time(); + let initial_start = state.earliest_data_time(); + + let batch_start = + start_time_for_sorted_batch(&key_batch, &self.config.input_schema_ref); + + state.add_data(batch_start, key_batch, watermark).await?; + + let new_action = state + .next_watermark_action_time() + .ok_or_else(|| anyhow!("missing next watermark action after add_data"))?; + let new_start = state + .earliest_data_time() + .ok_or_else(|| anyhow!("missing earliest data after add_data"))?; + + match initial_action { + Some(ia) => { + if ia != new_action { + self.pq_watermark_actions + .get_mut(&ia) + .expect("pq watermark entry") + .remove(&row_key); + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + } + let is = initial_start.expect("initial start"); + if is != new_start { + self.pq_start_times + .get_mut(&is) + .expect("pq start 
entry") + .remove(&row_key); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key.clone()); + } + } + None => { + self.pq_watermark_actions + .entry(new_action) + .or_default() + .insert(row_key.clone()); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(row_key); + } + } + } + Ok(()) + } + + async fn evaluate_watermark(&mut self, watermark: SystemTime) -> Result> { + let mut emit_results: Vec<(Vec, Vec)> = Vec::new(); + + loop { + let popped_action_time = match self.pq_watermark_actions.first_key_value() { + Some((t, _)) if *t < watermark => *t, + _ => break, + }; + let keys = self + .pq_watermark_actions + .remove(&popped_action_time) + .expect("pop watermark pq"); + + for key in keys { + let state = self + .session_states + .get_mut(&key) + .ok_or_else(|| anyhow!("missing session state for key"))?; + let initial_start = state + .earliest_data_time() + .ok_or_else(|| anyhow!("missing earliest data in evaluate_watermark"))?; + + let completed_sessions = state.advance_by_watermark(watermark).await?; + if !completed_sessions.is_empty() { + emit_results.push((key.clone(), completed_sessions)); + } + + self.pq_start_times + .get_mut(&initial_start) + .expect("pq start") + .remove(&key); + + if state.is_empty() { + self.session_states.remove(&key); + } else { + let new_start = state.earliest_data_time().expect("earliest after advance"); + self.pq_start_times + .entry(new_start) + .or_default() + .insert(key.clone()); + + let new_next_action = state + .next_watermark_action_time() + .expect("next action after advance"); + if new_next_action == popped_action_time { + bail!( + "processed watermark at {:?} but next watermark action stayed at {:?}", + watermark, + popped_action_time + ); + } + self.pq_watermark_actions + .entry(new_next_action) + .or_default() + .insert(key); + } + } + } + + if emit_results.is_empty() { + return Ok(vec![]); + } + + Ok(vec![self.format_to_arrow(emit_results)?]) + } + + fn format_to_arrow( + &self, + 
results: Vec<(Vec, Vec)>, + ) -> Result { + let (rows, session_results): (Vec<_>, Vec<_>) = results + .into_iter() + .flat_map(|(row, s_results)| s_results.into_iter().map(move |res| (row.clone(), res))) + .unzip(); + + let key_columns = if let Some(parser) = self.row_converter.parser() { + self.row_converter + .convert_rows(rows.iter().map(|row| parser.parse(row.as_ref())).collect())? + } else { + vec![] + }; + + let start_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_start) as i64) + .collect(); + let end_times: Vec = session_results + .iter() + .map(|r| to_nanos(r.window_end) as i64) + .collect(); + + let window_start_array = PrimitiveArray::::from(start_times); + let window_end_array = PrimitiveArray::::from(end_times.clone()); + + let result_batches: Vec<&RecordBatch> = + session_results.iter().map(|res| &res.batch).collect(); + let merged_batch = concat_batches(&session_results[0].batch.schema(), result_batches)?; + + let DataType::Struct(window_fields) = self.config.window_field.data_type() else { + bail!("expected window field to be a struct"); + }; + + let window_struct_array = StructArray::try_new( + window_fields.clone(), + vec![Arc::new(window_start_array), Arc::new(window_end_array)], + None, + )?; + + let mut columns = key_columns; + columns.insert(self.config.window_index, Arc::new(window_struct_array)); + columns.extend_from_slice(merged_batch.columns()); + + let ts_field = self + .config + .input_schema_ref + .schema + .field(self.config.input_schema_ref.timestamp_index); + append_output_timestamp_column(&mut columns, &session_results, ts_field)?; + + RecordBatch::try_new(self.config.output_schema.clone(), columns) + .context("failed to create session window output batch") + } + + #[allow(dead_code)] + fn earliest_batch_time(&self) -> Option { + self.pq_start_times + .first_key_value() + .map(|(start_time, _keys)| *start_time) + } +} + +#[async_trait] +impl Operator for SessionWindowOperator { + fn name(&self) -> &str { + 
"SessionWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let watermark_time = ctx.current_watermark(); + + let filtered_batch = self.filter_batch_by_time(batch, watermark_time)?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let sorted_batch = self.sort_batch(&filtered_batch)?; + + self.ingest_sorted_batch(sorted_batch, watermark_time) + .await?; + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let output_batches = self.evaluate_watermark(current_time).await?; + Ok(output_batches + .into_iter() + .map(StreamOutput::Forward) + .collect()) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SessionAggregatingWindowConstructor; + +impl SessionAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SessionWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let window_field = Arc::new(Field::new( + config.window_field_name, + window_arrow_struct(), + true, + )); + + let receiver_hook = Arc::new(RwLock::new(None)); + + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let final_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let final_execution_plan = final_plan.try_into_physical_plan( + registry.as_ref(), + 
&RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? + .try_into()?; + + let row_converter = if input_schema.routing_keys().is_none() { + let array = Arc::new(BooleanArray::from(vec![false])); + Converter::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + ) + } else { + let key_count = input_schema.routing_keys().as_ref().unwrap().len(); + Converter::RowConverter(RowConverter::new( + input_schema + .schema + .fields() + .into_iter() + .take(key_count) + .map(|field| SortField::new(field.data_type().clone())) + .collect(), + )?) + }; + + let output_schema = build_session_output_schema( + &input_schema, + window_field.clone(), + config.window_index as usize, + final_execution_plan.schema().as_ref(), + )?; + + let session_config = Arc::new(SessionWindowConfig { + gap: Duration::from_micros(config.gap_micros), + window_field, + window_index: config.window_index as usize, + input_schema_ref: Arc::new(input_schema), + final_physical_exec: final_execution_plan, + receiver_hook, + output_schema, + }); + + Ok(SessionWindowOperator { + config: session_config, + session_states: HashMap::new(), + pq_start_times: BTreeMap::new(), + pq_watermark_actions: BTreeMap::new(), + row_converter, + }) + } +} diff --git a/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs new file mode 100644 index 00000000..73ba4dc9 --- /dev/null +++ b/src/runtime/streaming/operators/windows/sliding_aggregating_window.rs @@ -0,0 +1,536 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Result, anyhow, bail}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{AsExecutionPlan, from_proto::parse_physical_expr}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::{BTreeMap, VecDeque}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark, from_nanos, to_nanos}; +use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec}; +use async_trait::async_trait; +use protocol::function_stream_graph::SlidingWindowAggregateOperator; +// ============================================================================ +// 
============================================================================

// NOTE(review): generic parameters in this span were stripped by extraction
// mangling and have been reconstructed from local usage; confirm against the
// original source.

/// One bin's worth of raw input batches.
#[derive(Default, Debug)]
struct RecordBatchPane {
    batches: Vec<RecordBatch>,
}

/// Fixed-width binning of batches starting at `start_time`; `panes[i]` holds
/// batches whose bin is `start_time + i * width`.
#[derive(Debug)]
struct RecordBatchTier {
    width: Duration,
    start_time: Option<SystemTime>,
    panes: VecDeque<RecordBatchPane>,
}

impl RecordBatchTier {
    fn new(width: Duration) -> Self {
        Self {
            width,
            start_time: None,
            panes: VecDeque::new(),
        }
    }

    /// Floors `timestamp` to this tier's bin boundary (identity for a
    /// zero-width tier).
    fn bin_start(&self, timestamp: SystemTime) -> SystemTime {
        if self.width == Duration::ZERO {
            return timestamp;
        }
        let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos());
        from_nanos(nanos)
    }

    /// Inserts a batch into the pane for `timestamp`'s bin, growing the pane
    /// deque as needed. Errors if `timestamp` bins before `start_time`
    /// (duration_since fails on an earlier argument).
    fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> {
        let bin_start = self.bin_start(timestamp);
        if self.start_time.is_none() {
            self.start_time = Some(bin_start);
            self.panes.push_back(RecordBatchPane {
                batches: vec![batch],
            });
            return Ok(());
        }

        let start_time = self.start_time.unwrap();
        let bin_index =
            (bin_start.duration_since(start_time)?.as_nanos() / self.width.as_nanos()) as usize;
        while self.panes.len() <= bin_index {
            self.panes.push_back(RecordBatchPane::default());
        }
        self.panes[bin_index].batches.push(batch);
        Ok(())
    }

    /// Batches stored for the bin starting exactly at `bin_start`; empty when
    /// the tier has no data or the bin is out of range.
    fn batches_for_timestamp(&self, bin_start: SystemTime) -> Result<Vec<RecordBatch>> {
        if self.start_time.map(|st| st > bin_start).unwrap_or(true) {
            return Ok(vec![]);
        }
        let bin_index = (bin_start
            .duration_since(self.start_time.unwrap())?
            .as_nanos()
            / self.width.as_nanos()) as usize;
        if self.panes.len() <= bin_index {
            return Ok(vec![]);
        }
        Ok(self.panes[bin_index].batches.clone())
    }

    /// Drops every pane strictly before `cutoff`'s bin and advances
    /// `start_time` accordingly.
    fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> {
        let bin_start = self.bin_start(cutoff);
        if self.start_time.map(|st| st >= bin_start).unwrap_or(true) {
            return Ok(());
        }
        let bin_index = (bin_start
            .duration_since(self.start_time.unwrap())
            .unwrap()
            .as_nanos()
            / self.width.as_nanos()) as usize;

        if bin_index >= self.panes.len() {
            self.panes.clear();
        } else {
            self.panes.drain(0..bin_index);
        }
        self.start_time = Some(bin_start);
        Ok(())
    }
}

/// The same data binned at several widths (each width an exact multiple of
/// the previous), so an arbitrary aligned interval can be covered with the
/// fewest pane reads.
#[derive(Debug)]
struct TieredRecordBatchHolder {
    tier_widths: Vec<Duration>,
    tiers: Vec<RecordBatchTier>,
}

impl TieredRecordBatchHolder {
    /// Validates that each width evenly divides the next, then builds one
    /// tier per width.
    fn new(tier_widths: Vec<Duration>) -> Result<Self> {
        for i in 0..tier_widths.len().saturating_sub(1) {
            if !tier_widths[i + 1]
                .as_nanos()
                .is_multiple_of(tier_widths[i].as_nanos())
            {
                bail!(
                    "tier width {} does not evenly divide next {}",
                    tier_widths[i].as_nanos(),
                    tier_widths[i + 1].as_nanos()
                );
            }
        }
        let tiers = tier_widths
            .iter()
            .map(|w| RecordBatchTier::new(*w))
            .collect();
        Ok(Self { tier_widths, tiers })
    }

    /// Every tier receives every batch (cheap: RecordBatch clones share
    /// underlying buffers).
    fn insert(&mut self, batch: RecordBatch, timestamp: SystemTime) -> Result<()> {
        for tier in self.tiers.iter_mut() {
            tier.insert(batch.clone(), timestamp)?;
        }
        Ok(())
    }

    /// Greedily covers [interval_start, interval_end) with the widest aligned
    /// bins available, collecting their batches.
    ///
    /// NOTE(review): if the finest tier's width does not divide the interval,
    /// the `saturating_sub(1)` step at tier 0 would not make progress —
    /// presumably callers guarantee slide-aligned intervals; confirm.
    fn batches_for_interval(
        &self,
        interval_start: SystemTime,
        interval_end: SystemTime,
    ) -> Result<Vec<RecordBatch>> {
        let mut batches = Vec::new();
        let mut current_tier = 0usize;
        let mut current_start = interval_start;

        while current_start < interval_end {
            let tier_end = current_start + self.tier_widths[current_tier];
            if tier_end > interval_end {
                // Too wide for the remainder: step down a tier.
                current_tier = current_tier.saturating_sub(1);
                continue;
            }
            if current_tier < self.tier_widths.len() - 1 {
                // Step up while the next (wider) tier is aligned and fits.
                let next_tier = &self.tiers[current_tier + 1];
                if next_tier.bin_start(current_start) == current_start
                    && current_start + next_tier.width <= interval_end
                {
                    current_tier += 1;
                    continue;
                }
            }
            batches.extend(self.tiers[current_tier].batches_for_timestamp(current_start)?);
            current_start += self.tier_widths[current_tier];
        }
        if current_start != interval_end {
            bail!(
                "interval end {:?} does not match current start {:?}",
                interval_end,
                current_start
            );
        }
        Ok(batches)
    }

    fn delete_before(&mut self, cutoff: SystemTime) -> Result<()> {
        for tier in self.tiers.iter_mut() {
            tier.delete_before(cutoff)?;
        }
        Ok(())
    }
}

// ============================================================================
// ============================================================================

/// One bin's in-flight partial aggregation: batches stream into `sender`,
/// partial results are drained from `result_stream` into `finished_batches`.
#[derive(Default)]
struct ActiveBin {
    sender: Option<UnboundedSender<RecordBatch>>,
    result_stream: Option<SendableRecordBatchStream>,
    finished_batches: Vec<RecordBatch>,
}

impl ActiveBin {
    /// Parks a fresh receiver in `hook` for the plan's source node, resets
    /// the plan, and starts executing it.
    fn start_partial(
        plan: Arc<dyn ExecutionPlan>,
        hook: &Arc<RwLock<Option<UnboundedReceiver<RecordBatch>>>>,
    ) -> Result<Self> {
        let (tx, rx) = unbounded_channel();
        *hook.write().unwrap() = Some(rx);
        plan.reset()?;
        let result_stream = plan.execute(0, SessionContext::new().task_ctx())?;
        Ok(Self {
            sender: Some(tx),
            result_stream: Some(result_stream),
            finished_batches: Vec::new(),
        })
    }

    /// Closes the input channel (end-of-stream for the plan) and drains all
    /// partial results into `finished_batches`.
    async fn close_and_drain(&mut self) -> Result<()> {
        self.sender.take();
        if let Some(mut stream) = self.result_stream.take() {
            while let Some(batch) = stream.next().await {
                self.finished_batches.push(batch?);
            }
        }
        Ok(())
    }
}

// ============================================================================
// ============================================================================

/// Sliding-window aggregation operator: raw input is kept in tiered bins,
/// per-bin partials are produced by `partial_aggregation_plan`, and each
/// window's final result is produced by `finish_execution_plan` +
/// `final_projection`.
pub struct SlidingWindowOperator {
    slide: Duration,
    width: Duration,
    /// Expression mapping each row to its bin timestamp.
    binning_function: Arc<dyn PhysicalExpr>,

    partial_aggregation_plan: Arc<dyn ExecutionPlan>,
    partial_schema: FsSchema,

    finish_execution_plan: Arc<dyn ExecutionPlan>,
    // NOTE(review): reconstructed as an ExecutionPlan from naming only —
    // usage is outside this chunk; confirm the original type.
    final_projection: Arc<dyn ExecutionPlan>,
    projection_input_schema: SchemaRef,

    receiver_hook: Arc<RwLock<Option<UnboundedReceiver<RecordBatch>>>>,
    final_batches_passer: Arc<RwLock<Vec<RecordBatch>>>,

    active_bins: BTreeMap<SystemTime, ActiveBin>,
    tiered_record_batches: TieredRecordBatchHolder,
}

impl
SlidingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.slide == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.slide.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = + ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + Ok(RecordBatch::try_new(schema, columns)?) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = std::mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for SlidingWindowOperator { + fn name(&self) -> &str { + "SlidingWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? 
+ .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + let watermark = ctx.current_watermark(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(wm) = watermark + && bin_start < self.bin_start(wm) + { + continue; + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("partial bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + let watermark_bin = self.bin_start(current_time); + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.slide <= watermark_bin { + expired_bins.push(k); + } else { + break; + } + } + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| anyhow!("missing active bin"))?; + let bin_end = bin_start + self.slide; + + bin.close_and_drain().await?; + for b in bin.finished_batches { + 
self.tiered_record_batches.insert(b, bin_start)?; + } + + let interval_start = bin_end - self.width; + let interval_end = bin_end; + + let partials = self + .tiered_record_batches + .batches_for_interval(interval_start, interval_end)?; + *self.final_batches_passer.write().unwrap() = partials; + + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + aggregate_results.push(Self::add_bin_start_as_timestamp( + &batch?, + interval_start, + self.projection_input_schema.clone(), + )?); + } + + *self.final_batches_passer.write().unwrap() = aggregate_results; + self.final_projection.reset()?; + let mut proj_exec = self + .final_projection + .execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + + self.tiered_record_batches + .delete_before(bin_end + self.slide - self.width)?; + } + + Ok(final_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// ============================================================================ + +pub struct SlidingAggregatingWindowConstructor; + +impl SlidingAggregatingWindowConstructor { + pub fn with_config( + &self, + config: SlidingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let slide = Duration::from_micros(config.slide_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = StreamingExtensionCodec { + context: StreamingDecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = + PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_proj = PhysicalPlanNode::decode(&mut config.final_projection.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? 
+ .try_into()?; + + Ok(SlidingWindowOperator { + slide, + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan: finish_plan, + final_projection: final_proj.clone(), + projection_input_schema: final_proj.children()[0].schema().clone(), + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + tiered_record_batches: TieredRecordBatchHolder::new(vec![slide])?, + }) + } +} diff --git a/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs new file mode 100644 index 00000000..de576bf0 --- /dev/null +++ b/src/runtime/streaming/operators/windows/tumbling_aggregating_window.rs @@ -0,0 +1,372 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, anyhow}; +use arrow::compute::{partition, sort_to_indices, take}; +use arrow_array::{Array, PrimitiveArray, RecordBatch, types::TimestampNanosecondType}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::{ + physical_plan::{AsExecutionPlan, from_proto::parse_physical_expr}, + protobuf::{PhysicalExprNode, PhysicalPlanNode}, +}; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::mem; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, SystemTime}; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; +use tracing::warn; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::time_utils::print_time; +use crate::sql::common::{CheckpointBarrier, FsSchema, Watermark, from_nanos, to_nanos}; +use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec}; +use crate::sql::schema::utils::add_timestamp_field_arrow; +use async_trait::async_trait; +use protocol::function_stream_graph::TumblingWindowAggregateOperator; + +#[derive(Default)] +struct ActiveBin { + sender: Option>, + result_stream: Option, + finished_batches: Vec, +} + +impl ActiveBin { + fn start_partial( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: 
Some(result_stream), + finished_batches: Vec::new(), + }) + } + + async fn close_and_drain(&mut self) -> Result<()> { + self.sender.take(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + self.finished_batches.push(batch?); + } + } + Ok(()) + } +} + +pub struct TumblingWindowOperator { + width: Duration, + binning_function: Arc, + + partial_aggregation_plan: Arc, + partial_schema: FsSchema, + + finish_execution_plan: Arc, + aggregate_with_timestamp_schema: SchemaRef, + final_projection: Option>, + + receiver_hook: Arc>>>, + final_batches_passer: Arc>>, + + active_bins: BTreeMap, +} + +impl TumblingWindowOperator { + fn bin_start(&self, timestamp: SystemTime) -> SystemTime { + if self.width == Duration::ZERO { + return timestamp; + } + let nanos = to_nanos(timestamp) - (to_nanos(timestamp) % self.width.as_nanos()); + from_nanos(nanos) + } + + fn add_bin_start_as_timestamp( + batch: &RecordBatch, + bin_start: SystemTime, + schema: SchemaRef, + ) -> Result { + let bin_start_scalar = + ScalarValue::TimestampNanosecond(Some(to_nanos(bin_start) as i64), None); + let timestamp_array = bin_start_scalar.to_array_of_size(batch.num_rows())?; + let mut columns = batch.columns().to_vec(); + columns.push(timestamp_array); + RecordBatch::try_new(schema.clone(), columns) + .map_err(|e| anyhow!("add _timestamp column: {e}")) + } + + fn ensure_bin_running( + slot: &mut ActiveBin, + plan: Arc, + hook: &Arc>>>, + ) -> Result<()> { + if slot.sender.is_some() { + return Ok(()); + } + let preserved = mem::take(&mut slot.finished_batches); + let mut started = ActiveBin::start_partial(plan, hook)?; + started.finished_batches = preserved; + *slot = started; + Ok(()) + } +} + +#[async_trait] +impl Operator for TumblingWindowOperator { + fn name(&self) -> &str { + "TumblingWindow" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + 
batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let bin_array = self + .binning_function + .evaluate(&batch)? + .into_array(batch.num_rows())?; + let indices = sort_to_indices(bin_array.as_ref(), None, None)?; + + let columns = batch + .columns() + .iter() + .map(|c| take(c, &indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(batch.schema(), columns)?; + let sorted_bins = take(bin_array.as_ref(), &indices, None)?; + + let typed_bin = sorted_bins + .as_any() + .downcast_ref::>() + .ok_or_else(|| anyhow!("binning function must produce TimestampNanosecond"))?; + let partition_ranges = partition(std::slice::from_ref(&sorted_bins))?.ranges(); + + for range in partition_ranges { + let bin_start = from_nanos(typed_bin.value(range.start) as u128); + + if let Some(watermark) = ctx.current_watermark() + && bin_start < self.bin_start(watermark) + { + warn!( + "late data dropped: bin {} < watermark {}", + print_time(bin_start), + print_time(watermark) + ); + continue; + } + + let bin_batch = sorted.slice(range.start, range.end - range.start); + let slot = self.active_bins.entry(bin_start).or_default(); + + Self::ensure_bin_running( + slot, + self.partial_aggregation_plan.clone(), + &self.receiver_hook, + )?; + + let sender = slot + .sender + .as_ref() + .ok_or_else(|| anyhow!("tumbling bin sender missing after ensure"))?; + sender + .send(bin_batch) + .map_err(|e| anyhow!("partial channel send: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_bins = Vec::new(); + for &k in self.active_bins.keys() { + if k + self.width <= current_time { + expired_bins.push(k); + } else { + break; + } + } + + for bin_start in expired_bins { + let mut bin = self + .active_bins + .remove(&bin_start) + .ok_or_else(|| 
anyhow!("missing tumbling bin"))?; + + bin.close_and_drain().await?; + let partial_batches = mem::take(&mut bin.finished_batches); + + if partial_batches.is_empty() { + continue; + } + + *self.final_batches_passer.write().unwrap() = partial_batches; + self.finish_execution_plan.reset()?; + let mut final_exec = self + .finish_execution_plan + .execute(0, SessionContext::new().task_ctx())?; + + let mut aggregate_results = Vec::new(); + while let Some(batch) = final_exec.next().await { + let batch = batch?; + let with_timestamp = Self::add_bin_start_as_timestamp( + &batch, + bin_start, + self.aggregate_with_timestamp_schema.clone(), + )?; + + if self.final_projection.is_none() { + final_outputs.push(StreamOutput::Forward(with_timestamp)); + } else { + aggregate_results.push(with_timestamp); + } + } + + if let Some(final_projection) = &self.final_projection { + *self.final_batches_passer.write().unwrap() = aggregate_results; + final_projection.reset()?; + let mut proj_exec = + final_projection.execute(0, SessionContext::new().task_ctx())?; + + while let Some(batch) = proj_exec.next().await { + final_outputs.push(StreamOutput::Forward(batch?)); + } + } + } + + Ok(final_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +pub struct TumblingAggregateWindowConstructor; + +impl TumblingAggregateWindowConstructor { + pub fn with_config( + &self, + config: TumblingWindowAggregateOperator, + registry: Arc, + ) -> anyhow::Result { + let width = Duration::from_micros(config.width_micros); + let input_schema: FsSchema = config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))? 
+ .try_into()?; + + let binning_function = parse_physical_expr( + &PhysicalExprNode::decode(&mut config.binning_function.as_slice())?, + registry.as_ref(), + &input_schema.schema, + &DefaultPhysicalExtensionCodec {}, + )?; + + let receiver_hook = Arc::new(RwLock::new(None)); + let final_batches_passer = Arc::new(RwLock::new(Vec::new())); + + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + let final_codec = StreamingExtensionCodec { + context: StreamingDecodingContext::LockedBatchVec(final_batches_passer.clone()), + }; + + let partial_plan = + PhysicalPlanNode::decode(&mut config.partial_aggregation_plan.as_slice())? + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + )?; + + let partial_schema: FsSchema = config + .partial_schema + .ok_or_else(|| anyhow!("missing partial schema"))? + .try_into()?; + + let finish_plan = PhysicalPlanNode::decode(&mut config.final_aggregation_plan.as_slice())?; + let finish_execution_plan = finish_plan.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?; + + let final_projection_plan = match &config.final_projection { + Some(proto) if !proto.is_empty() => { + let node = PhysicalPlanNode::decode(&mut proto.as_slice()) + .map_err(|e| anyhow!("decode final_projection: {e}"))?; + Some(node.try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &final_codec, + )?) 
+ } + _ => None, + }; + + let aggregate_with_timestamp_schema = + add_timestamp_field_arrow((*finish_execution_plan.schema()).clone()); + + Ok(TumblingWindowOperator { + width, + binning_function, + partial_aggregation_plan: partial_plan, + partial_schema, + finish_execution_plan, + aggregate_with_timestamp_schema, + final_projection: final_projection_plan, + receiver_hook, + final_batches_passer, + active_bins: BTreeMap::new(), + }) + } +} diff --git a/src/runtime/streaming/operators/windows/window_function.rs b/src/runtime/streaming/operators/windows/window_function.rs new file mode 100644 index 00000000..5e340fec --- /dev/null +++ b/src/runtime/streaming/operators/windows/window_function.rs @@ -0,0 +1,281 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, anyhow}; +use arrow::compute::{max, min}; +use arrow_array::RecordBatch; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use futures::StreamExt; +use prost::Message; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; +use std::time::SystemTime; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; +use tracing::warn; + +use crate::runtime::streaming::StreamOutput; +use crate::runtime::streaming::api::context::TaskContext; +use crate::runtime::streaming::api::operator::Operator; +use crate::runtime::streaming::factory::Registry; +use crate::sql::common::time_utils::print_time; +use crate::sql::common::{CheckpointBarrier, FsSchema, FsSchemaRef, Watermark, from_nanos}; +use crate::sql::physical::{StreamingDecodingContext, StreamingExtensionCodec}; +use async_trait::async_trait; + +// ============================================================================ +// ============================================================================ + +struct ActiveWindowExec { + sender: Option>, + result_stream: Option, +} + +impl ActiveWindowExec { + fn new( + plan: Arc, + hook: &Arc>>>, + ) -> Result { + let (tx, rx) = unbounded_channel(); + *hook.write().unwrap() = Some(rx); + plan.reset()?; + let result_stream = plan.execute(0, SessionContext::new().task_ctx())?; + Ok(Self { + sender: Some(tx), + result_stream: Some(result_stream), + }) + } + + async fn close_and_drain(&mut self) -> Result> { + self.sender.take(); + let mut results = Vec::new(); + if let Some(mut stream) = self.result_stream.take() { + while let Some(batch) = stream.next().await { + results.push(batch?); + } + } + Ok(results) + } +} + +// 
============================================================================ +// ============================================================================ + +pub struct WindowFunctionOperator { + input_schema: FsSchemaRef, + input_schema_unkeyed: FsSchemaRef, + window_exec_plan: Arc, + receiver_hook: Arc>>>, + active_execs: BTreeMap, +} + +impl WindowFunctionOperator { + fn filter_and_split_batches( + &self, + batch: RecordBatch, + watermark: Option, + ) -> Result> { + if batch.num_rows() == 0 { + return Ok(vec![]); + } + + let timestamp_column = self.input_schema.timestamp_column(&batch); + let min_timestamp = from_nanos(min(timestamp_column).unwrap() as u128); + let max_timestamp = from_nanos(max(timestamp_column).unwrap() as u128); + + if let Some(wm) = watermark + && max_timestamp < wm + { + warn!( + "dropped late batch: max_ts {} < watermark {}", + print_time(max_timestamp), + print_time(wm) + ); + return Ok(vec![]); + } + + if min_timestamp == max_timestamp { + return Ok(vec![(batch, max_timestamp)]); + } + + let sorted_batch = self + .input_schema_unkeyed + .sort(batch, true) + .map_err(|e| anyhow!("sort for window fn: {e}"))?; + let filtered_batch = self + .input_schema_unkeyed + .filter_by_time(sorted_batch, watermark) + .map_err(|e| anyhow!("filter_by_time: {e}"))?; + if filtered_batch.num_rows() == 0 { + return Ok(vec![]); + } + + let filtered_timestamps = self.input_schema.timestamp_column(&filtered_batch); + let ranges = self + .input_schema_unkeyed + .partition(&filtered_batch, true) + .map_err(|e| anyhow!("partition by time: {e}"))?; + + let mut batches = Vec::with_capacity(ranges.len()); + for range in ranges { + let slice = filtered_batch.slice(range.start, range.end - range.start); + let ts = from_nanos(filtered_timestamps.value(range.start) as u128); + batches.push((slice, ts)); + } + Ok(batches) + } + + fn get_or_create_exec(&mut self, timestamp: SystemTime) -> Result<&mut ActiveWindowExec> { + use std::collections::btree_map::Entry; + match 
self.active_execs.entry(timestamp) { + Entry::Vacant(v) => { + let new_exec = + ActiveWindowExec::new(self.window_exec_plan.clone(), &self.receiver_hook)?; + Ok(v.insert(new_exec)) + } + Entry::Occupied(o) => Ok(o.into_mut()), + } + } +} + +#[async_trait] +impl Operator for WindowFunctionOperator { + fn name(&self) -> &str { + "WindowFunction" + } + + async fn on_start(&mut self, _ctx: &mut TaskContext) -> Result<()> { + Ok(()) + } + + async fn process_data( + &mut self, + _input_idx: usize, + batch: RecordBatch, + ctx: &mut TaskContext, + ) -> Result> { + let current_watermark = ctx.current_watermark(); + let split_batches = self.filter_and_split_batches(batch, current_watermark)?; + + for (sub_batch, timestamp) in split_batches { + let exec = self.get_or_create_exec(timestamp)?; + exec.sender + .as_ref() + .ok_or_else(|| anyhow!("window exec sender missing"))? + .send(sub_batch) + .map_err(|e| anyhow!("route batch to plan: {e}"))?; + } + + Ok(vec![]) + } + + async fn process_watermark( + &mut self, + watermark: Watermark, + _ctx: &mut TaskContext, + ) -> Result> { + let Watermark::EventTime(current_time) = watermark else { + return Ok(vec![]); + }; + + let mut final_outputs = Vec::new(); + + let mut expired_timestamps = Vec::new(); + for &k in self.active_execs.keys() { + if k < current_time { + expired_timestamps.push(k); + } else { + break; + } + } + + for ts in expired_timestamps { + let mut exec = self + .active_execs + .remove(&ts) + .ok_or_else(|| anyhow!("missing window exec"))?; + let result_batches = exec.close_and_drain().await?; + for batch in result_batches { + final_outputs.push(StreamOutput::Forward(batch)); + } + } + + Ok(final_outputs) + } + + async fn snapshot_state( + &mut self, + _barrier: CheckpointBarrier, + _ctx: &mut TaskContext, + ) -> Result<()> { + Ok(()) + } + + async fn on_close(&mut self, _ctx: &mut TaskContext) -> Result> { + Ok(vec![]) + } +} + +// ============================================================================ +// 
============================================================================ + +pub struct WindowFunctionConstructor; + +impl WindowFunctionConstructor { + pub fn with_config( + &self, + config: protocol::function_stream_graph::WindowFunctionOperator, + registry: Arc, + ) -> anyhow::Result { + let input_schema = Arc::new( + FsSchema::try_from( + config + .input_schema + .ok_or_else(|| anyhow!("missing input schema"))?, + ) + .map_err(|e| anyhow!("input schema: {e}"))?, + ); + + let input_schema_unkeyed = Arc::new( + FsSchema::from_schema_unkeyed(input_schema.schema.clone()) + .map_err(|e| anyhow!("unkeyed schema: {e}"))?, + ); + + let receiver_hook = Arc::new(RwLock::new(None)); + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::UnboundedBatchStream(receiver_hook.clone()), + }; + + let window_exec_node = + PhysicalPlanNode::decode(&mut config.window_function_plan.as_slice()) + .map_err(|e| anyhow!("decode window_function_plan: {e}"))?; + let window_exec_plan = window_exec_node + .try_into_physical_plan( + registry.as_ref(), + &RuntimeEnvBuilder::new().build()?, + &codec, + ) + .map_err(|e| anyhow!("window physical plan: {e}"))?; + + Ok(WindowFunctionOperator { + input_schema, + input_schema_unkeyed, + window_exec_plan, + receiver_hook, + active_execs: BTreeMap::new(), + }) + } +} diff --git a/src/runtime/streaming/protocol/control.rs b/src/runtime/streaming/protocol/control.rs new file mode 100644 index 00000000..3b23cb09 --- /dev/null +++ b/src/runtime/streaming/protocol/control.rs @@ -0,0 +1,81 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::event::CheckpointBarrier; +use serde::{Deserialize, Serialize}; +use std::time::Duration; +use tokio::sync::mpsc::{self, Receiver, Sender}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct CheckpointBarrierWire { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp_secs: u64, + pub timestamp_subsec_nanos: u32, + pub then_stop: bool, +} + +impl From for CheckpointBarrierWire { + fn from(b: CheckpointBarrier) -> Self { + let d = b + .timestamp + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default(); + Self { + epoch: b.epoch, + min_epoch: b.min_epoch, + timestamp_secs: d.as_secs(), + timestamp_subsec_nanos: d.subsec_nanos(), + then_stop: b.then_stop, + } + } +} + +impl From for CheckpointBarrier { + fn from(w: CheckpointBarrierWire) -> Self { + Self { + epoch: w.epoch, + min_epoch: w.min_epoch, + timestamp: std::time::UNIX_EPOCH + + Duration::new(w.timestamp_secs, w.timestamp_subsec_nanos), + then_stop: w.then_stop, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ControlCommand { + Start, + Stop { mode: StopMode }, + DropState, + Commit { epoch: u32 }, + UpdateConfig { config_json: String }, + TriggerCheckpoint { barrier: CheckpointBarrierWire }, +} + +impl ControlCommand { + pub fn trigger_checkpoint(barrier: CheckpointBarrier) -> Self { + Self::TriggerCheckpoint { + barrier: barrier.into(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum StopMode { + Graceful, + Immediate, +} + +pub fn control_channel(capacity: usize) -> (Sender, 
Receiver) { + mpsc::channel(capacity) +} diff --git a/src/runtime/streaming/protocol/event.rs b/src/runtime/streaming/protocol/event.rs new file mode 100644 index 00000000..823035f8 --- /dev/null +++ b/src/runtime/streaming/protocol/event.rs @@ -0,0 +1,146 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use bincode::{Decode, Encode}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::SystemTime; + +use arrow_array::RecordBatch; + +use crate::runtime::streaming::memory::MemoryTicket; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub enum Watermark { + EventTime(SystemTime), + Idle, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Encode, Decode, Serialize, Deserialize)] +pub struct CheckpointBarrier { + pub epoch: u32, + pub min_epoch: u32, + pub timestamp: SystemTime, + pub then_stop: bool, +} + +#[derive(Debug, Clone)] +pub enum StreamEvent { + Data(RecordBatch), + Watermark(Watermark), + Barrier(CheckpointBarrier), + EndOfStream, +} + +#[derive(Debug, Clone)] +pub enum StreamOutput { + Forward(RecordBatch), + Keyed(u64, RecordBatch), + Broadcast(RecordBatch), + Watermark(Watermark), +} + +#[derive(Debug, Clone)] +pub struct TrackedEvent { + pub event: StreamEvent, + pub _ticket: Option>, +} + +impl TrackedEvent { + pub fn new(event: StreamEvent, ticket: Option) -> Self { + Self { + event, + _ticket: ticket.map(Arc::new), + } + } + + pub fn control(event: StreamEvent) -> Self { + Self { 
+ event, + _ticket: None, + } + } +} + +pub fn merge_watermarks(per_input: &[Option]) -> Option { + if per_input.iter().any(|w| w.is_none()) { + return None; + } + + let mut min_event: Option = None; + let mut all_idle = true; + + for w in per_input.iter().flatten() { + match w { + Watermark::Idle => {} + Watermark::EventTime(t) => { + all_idle = false; + min_event = Some(match min_event { + None => *t, + Some(m) => m.min(*t), + }); + } + } + } + + if all_idle { + Some(Watermark::Idle) + } else { + Some(Watermark::EventTime(min_event.expect( + "non-idle alignment must have at least one EventTime", + ))) + } +} + +pub fn watermark_strictly_advances(new: Watermark, previous: Option) -> bool { + match previous { + None => true, + Some(prev) => match (new, prev) { + (Watermark::EventTime(tn), Watermark::EventTime(tp)) => tn > tp, + (Watermark::Idle, Watermark::Idle) => false, + (Watermark::Idle, Watermark::EventTime(_)) => true, + (Watermark::EventTime(_), Watermark::Idle) => true, + }, + } +} + +#[cfg(test)] +mod watermark_tests { + use super::*; + use std::time::Duration; + + #[test] + fn merge_waits_for_all_channels() { + let wms = vec![Some(Watermark::EventTime(SystemTime::UNIX_EPOCH)), None]; + assert!(merge_watermarks(&wms).is_none()); + } + + #[test] + fn merge_min_event_time_ignores_idle() { + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(10); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(5); + let wms = vec![Some(Watermark::EventTime(t1)), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t1))); + + let wms = vec![ + Some(Watermark::EventTime(t1)), + Some(Watermark::EventTime(t2)), + ]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::EventTime(t2))); + } + + #[test] + fn merge_all_idle() { + let wms = vec![Some(Watermark::Idle), Some(Watermark::Idle)]; + assert_eq!(merge_watermarks(&wms), Some(Watermark::Idle)); + } +} diff --git a/src/runtime/streaming/protocol/mod.rs 
b/src/runtime/streaming/protocol/mod.rs new file mode 100644 index 00000000..e91e8d8c --- /dev/null +++ b/src/runtime/streaming/protocol/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod control; +pub mod event; + +pub use event::{CheckpointBarrier, StreamOutput, Watermark}; diff --git a/src/runtime/util/mod.rs b/src/runtime/util/mod.rs new file mode 100644 index 00000000..60eba772 --- /dev/null +++ b/src/runtime/util/mod.rs @@ -0,0 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod physical_aggregate; + +pub use physical_aggregate::decode_aggregate; diff --git a/src/runtime/util/physical_aggregate.rs b/src/runtime/util/physical_aggregate.rs new file mode 100644 index 00000000..ece3f772 --- /dev/null +++ b/src/runtime/util/physical_aggregate.rs @@ -0,0 +1,76 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::common::Result as DFResult; +use datafusion::common::internal_err; +use datafusion::execution::FunctionRegistry; +use datafusion::physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr}; +use datafusion::physical_expr::{LexOrdering, PhysicalExpr}; +use datafusion_proto::physical_plan::from_proto::{parse_physical_expr, parse_physical_sort_expr}; +use datafusion_proto::physical_plan::{DefaultPhysicalExtensionCodec, PhysicalExtensionCodec}; +use datafusion_proto::protobuf::physical_aggregate_expr_node::AggregateFunction; +use datafusion_proto::protobuf::physical_expr_node::ExprType; +use datafusion_proto::protobuf::{PhysicalExprNode, proto_error}; + +pub fn decode_aggregate( + schema: &SchemaRef, + name: &str, + expr: &PhysicalExprNode, + registry: &dyn FunctionRegistry, +) -> DFResult> { + let codec = &DefaultPhysicalExtensionCodec {}; + let expr_type = expr + .expr_type + .as_ref() + .ok_or_else(|| proto_error("Unexpected empty aggregate physical expression"))?; + + match expr_type { + ExprType::AggregateExpr(agg_node) => { + let input_phy_expr: Vec> = agg_node + .expr + .iter() + .map(|e| parse_physical_expr(e, registry, schema, codec)) + .collect::>>()?; + let ordering_req: LexOrdering = agg_node + .ordering_req + .iter() + .map(|e| parse_physical_sort_expr(e, registry, schema, codec)) + .collect::>()?; + agg_node + .aggregate_function + .as_ref() + .map(|func| match func { + AggregateFunction::UserDefinedAggrFunction(udaf_name) => { + let agg_udf = match 
&agg_node.fun_definition { + Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, + None => registry.udaf(udaf_name)?, + }; + + AggregateExprBuilder::new(agg_udf, input_phy_expr) + .schema(Arc::clone(schema)) + .alias(name) + .with_ignore_nulls(agg_node.ignore_nulls) + .with_distinct(agg_node.distinct) + .order_by(ordering_req) + .build() + .map(Arc::new) + } + }) + .transpose()? + .ok_or_else(|| proto_error("Invalid AggregateExpr, missing aggregate_function")) + } + _ => internal_err!("Invalid aggregate expression for AggregateExec"), + } +} diff --git a/src/runtime/input/input_protocol.rs b/src/runtime/wasm/input/input_protocol.rs similarity index 100% rename from src/runtime/input/input_protocol.rs rename to src/runtime/wasm/input/input_protocol.rs diff --git a/src/runtime/input/input_provider.rs b/src/runtime/wasm/input/input_provider.rs similarity index 100% rename from src/runtime/input/input_provider.rs rename to src/runtime/wasm/input/input_provider.rs diff --git a/src/runtime/input/input_runner.rs b/src/runtime/wasm/input/input_runner.rs similarity index 100% rename from src/runtime/input/input_runner.rs rename to src/runtime/wasm/input/input_runner.rs diff --git a/src/runtime/input/interface.rs b/src/runtime/wasm/input/interface.rs similarity index 100% rename from src/runtime/input/interface.rs rename to src/runtime/wasm/input/interface.rs diff --git a/src/runtime/input/mod.rs b/src/runtime/wasm/input/mod.rs similarity index 100% rename from src/runtime/input/mod.rs rename to src/runtime/wasm/input/mod.rs diff --git a/src/runtime/input/protocol/kafka/config.rs b/src/runtime/wasm/input/protocol/kafka/config.rs similarity index 100% rename from src/runtime/input/protocol/kafka/config.rs rename to src/runtime/wasm/input/protocol/kafka/config.rs diff --git a/src/runtime/input/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/input/protocol/kafka/kafka_protocol.rs rename to 
src/runtime/wasm/input/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/input/protocol/kafka/mod.rs b/src/runtime/wasm/input/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/input/protocol/kafka/mod.rs rename to src/runtime/wasm/input/protocol/kafka/mod.rs diff --git a/src/runtime/wasm/input/protocol/mod.rs b/src/runtime/wasm/input/protocol/mod.rs new file mode 100644 index 00000000..b9574391 --- /dev/null +++ b/src/runtime/wasm/input/protocol/mod.rs @@ -0,0 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod kafka; diff --git a/src/runtime/wasm/mod.rs b/src/runtime/wasm/mod.rs new file mode 100644 index 00000000..b1c82f4c --- /dev/null +++ b/src/runtime/wasm/mod.rs @@ -0,0 +1,18 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! WebAssembly runtime integration. 
+ +pub mod input; +pub mod output; +pub mod processor; diff --git a/src/runtime/output/interface.rs b/src/runtime/wasm/output/interface.rs similarity index 100% rename from src/runtime/output/interface.rs rename to src/runtime/wasm/output/interface.rs diff --git a/src/runtime/output/mod.rs b/src/runtime/wasm/output/mod.rs similarity index 100% rename from src/runtime/output/mod.rs rename to src/runtime/wasm/output/mod.rs diff --git a/src/runtime/output/output_protocol.rs b/src/runtime/wasm/output/output_protocol.rs similarity index 100% rename from src/runtime/output/output_protocol.rs rename to src/runtime/wasm/output/output_protocol.rs diff --git a/src/runtime/output/output_provider.rs b/src/runtime/wasm/output/output_provider.rs similarity index 100% rename from src/runtime/output/output_provider.rs rename to src/runtime/wasm/output/output_provider.rs diff --git a/src/runtime/output/output_runner.rs b/src/runtime/wasm/output/output_runner.rs similarity index 100% rename from src/runtime/output/output_runner.rs rename to src/runtime/wasm/output/output_runner.rs diff --git a/src/runtime/output/protocol/kafka/kafka_protocol.rs b/src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs similarity index 100% rename from src/runtime/output/protocol/kafka/kafka_protocol.rs rename to src/runtime/wasm/output/protocol/kafka/kafka_protocol.rs diff --git a/src/runtime/output/protocol/kafka/mod.rs b/src/runtime/wasm/output/protocol/kafka/mod.rs similarity index 100% rename from src/runtime/output/protocol/kafka/mod.rs rename to src/runtime/wasm/output/protocol/kafka/mod.rs diff --git a/src/runtime/output/protocol/kafka/producer_config.rs b/src/runtime/wasm/output/protocol/kafka/producer_config.rs similarity index 100% rename from src/runtime/output/protocol/kafka/producer_config.rs rename to src/runtime/wasm/output/protocol/kafka/producer_config.rs diff --git a/src/runtime/output/protocol/mod.rs b/src/runtime/wasm/output/protocol/mod.rs similarity index 100% rename from 
src/runtime/output/protocol/mod.rs rename to src/runtime/wasm/output/protocol/mod.rs diff --git a/src/runtime/processor/function_error.rs b/src/runtime/wasm/processor/function_error.rs similarity index 71% rename from src/runtime/processor/function_error.rs rename to src/runtime/wasm/processor/function_error.rs index b38f8dd9..f9b8fe8e 100644 --- a/src/runtime/processor/function_error.rs +++ b/src/runtime/wasm/processor/function_error.rs @@ -1,3 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ #[derive(Debug, Clone)] pub enum FunctionErrorStage { Input, diff --git a/src/runtime/processor/mod.rs b/src/runtime/wasm/processor/mod.rs similarity index 100% rename from src/runtime/processor/mod.rs rename to src/runtime/wasm/processor/mod.rs diff --git a/src/runtime/processor/python/mod.rs b/src/runtime/wasm/processor/python/mod.rs similarity index 100% rename from src/runtime/processor/python/mod.rs rename to src/runtime/wasm/processor/python/mod.rs diff --git a/src/runtime/processor/python/python_host.rs b/src/runtime/wasm/processor/python/python_host.rs similarity index 100% rename from src/runtime/processor/python/python_host.rs rename to src/runtime/wasm/processor/python/python_host.rs diff --git a/src/runtime/processor/python/python_service.rs b/src/runtime/wasm/processor/python/python_service.rs similarity index 100% rename from src/runtime/processor/python/python_service.rs rename to src/runtime/wasm/processor/python/python_service.rs diff --git a/src/runtime/processor/wasm/input_strategy.rs b/src/runtime/wasm/processor/wasm/input_strategy.rs similarity index 100% rename from src/runtime/processor/wasm/input_strategy.rs rename to src/runtime/wasm/processor/wasm/input_strategy.rs diff --git a/src/runtime/processor/wasm/mod.rs b/src/runtime/wasm/processor/wasm/mod.rs similarity index 100% rename from src/runtime/processor/wasm/mod.rs rename to src/runtime/wasm/processor/wasm/mod.rs diff --git a/src/runtime/processor/wasm/thread_pool.rs b/src/runtime/wasm/processor/wasm/thread_pool.rs similarity index 100% rename from src/runtime/processor/wasm/thread_pool.rs rename to src/runtime/wasm/processor/wasm/thread_pool.rs diff --git a/src/runtime/processor/wasm/wasm_cache.rs b/src/runtime/wasm/processor/wasm/wasm_cache.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_cache.rs rename to src/runtime/wasm/processor/wasm/wasm_cache.rs diff --git a/src/runtime/processor/wasm/wasm_host.rs b/src/runtime/wasm/processor/wasm/wasm_host.rs similarity 
index 100% rename from src/runtime/processor/wasm/wasm_host.rs rename to src/runtime/wasm/processor/wasm/wasm_host.rs diff --git a/src/runtime/processor/wasm/wasm_processor.rs b/src/runtime/wasm/processor/wasm/wasm_processor.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_processor.rs rename to src/runtime/wasm/processor/wasm/wasm_processor.rs diff --git a/src/runtime/processor/wasm/wasm_processor_trait.rs b/src/runtime/wasm/processor/wasm/wasm_processor_trait.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_processor_trait.rs rename to src/runtime/wasm/processor/wasm/wasm_processor_trait.rs diff --git a/src/runtime/processor/wasm/wasm_task.rs b/src/runtime/wasm/processor/wasm/wasm_task.rs similarity index 100% rename from src/runtime/processor/wasm/wasm_task.rs rename to src/runtime/wasm/processor/wasm/wasm_task.rs diff --git a/src/server/handler.rs b/src/server/handler.rs index 4721a5a1..82ccb803 100644 --- a/src/server/handler.rs +++ b/src/server/handler.rs @@ -14,8 +14,8 @@ use std::sync::Arc; use std::time::Instant; use arrow_ipc::writer::StreamWriter; -use log::{error, info}; use tonic::{Request, Response as TonicResponse, Status}; +use tracing::{debug, error, info, warn}; use protocol::service::FunctionInfo as ProtoFunctionInfo; use protocol::service::{ @@ -24,12 +24,11 @@ use protocol::service::{ StopFunctionRequest, function_stream_service_server::FunctionStreamService, }; -use crate::coordinator::Coordinator; use crate::coordinator::{ - CreateFunction, CreatePythonFunction, DataSet, DropFunction, ShowFunctions, - ShowFunctionsResult, StartFunction, Statement, StopFunction, + Coordinator, CreateFunction, CreatePythonFunction, DataSet, DropFunction, PythonModule, + ShowFunctions, ShowFunctionsResult, StartFunction, Statement, StopFunction, }; -use crate::sql::SqlParser; +use crate::sql::parse::parse_sql; pub struct FunctionStreamServiceImpl { coordinator: Arc, @@ -40,23 +39,70 @@ impl FunctionStreamServiceImpl { Self 
{ coordinator } } - fn build_response(status_code: StatusCode, message: String, data: Option>) -> Response { + fn serialize_dataset(ds: &dyn DataSet) -> Result, String> { + let batch = ds.to_record_batch(); + let mut buf = Vec::new(); + + let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()) + .map_err(|e| format!("IPC writer initialization failed: {e}"))?; + + writer + .write(&batch) + .map_err(|e| format!("IPC write failed: {e}"))?; + + writer + .finish() + .map_err(|e| format!("IPC finish failed: {e}"))?; + + Ok(buf) + } + + fn build_success_response( + status: StatusCode, + message: String, + data: Option>, + ) -> Response { + let payload = match data { + Some(ds) => match Self::serialize_dataset(ds.as_ref()) { + Ok(bytes) => Some(bytes), + Err(e) => { + error!("Data serialization error: {}", e); + return Self::build_error_response( + StatusCode::InternalServerError, + "Internal data serialization error".to_string(), + ); + } + }, + None => None, + }; + Response { - status_code: status_code as i32, + status_code: status as i32, message, - data, + data: payload, } } - fn data_set_to_ipc_bytes(ds: &dyn DataSet) -> Option> { - let batch = ds.to_record_batch(); - let mut buf = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buf, &batch.schema()).ok()?; - writer.write(&batch).ok()?; - writer.finish().ok()?; + fn build_error_response(status: StatusCode, message: String) -> Response { + Response { + status_code: status as i32, + message, + data: None, + } + } + + async fn execute_statement( + &self, + stmt: &dyn Statement, + success_status: StatusCode, + ) -> Response { + let result = self.coordinator.execute_with_stream_catalog(stmt).await; + + if result.success { + Self::build_success_response(success_status, result.message, result.data) + } else { + Self::build_error_response(StatusCode::InternalServerError, result.message) } - Some(buf) } } @@ -66,225 +112,137 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: 
Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - let parse_start = Instant::now(); - let stmt = match SqlParser::parse(&req.sql) { - Ok(stmt) => { - log::debug!("SQL parsed in {}ms", parse_start.elapsed().as_millis()); - stmt - } - Err(e) => { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - format!("Parse error: {}", e), - None, + let statements = parse_sql(&req.sql).map_err(|e| { + let detail = e.to_string(); + warn!("SQL parse rejection: {}", detail); + Status::invalid_argument(detail) + })?; + + if statements.is_empty() { + return Ok(TonicResponse::new(Self::build_success_response( + StatusCode::Ok, + "No statements executed".to_string(), + None, + ))); + } + + let mut final_response = None; + + for stmt in statements { + let result = self + .coordinator + .execute_with_stream_catalog(stmt.as_ref()) + .await; + + if !result.success { + error!("SQL execution aborted: {}", result.message); + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::InternalServerError, + result.message, ))); } - }; - - let exec_start = Instant::now(); - let result = self.coordinator.execute(stmt.as_ref()); - log::debug!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let status_code = if result.success { - StatusCode::Ok - } else { - error!("Execution failed: {}", result.message); - StatusCode::InternalServerError - }; + final_response = Some(result); + } - log::debug!( - "Total SQL request cost: {}ms", - start_time.elapsed().as_millis() - ); + let result = final_response.unwrap(); + let response = Self::build_success_response(StatusCode::Ok, result.message, result.data); - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + debug!("execute_sql completed in {}ms", timer.elapsed().as_millis()); + 
Ok(TonicResponse::new(response)) } async fn create_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreateFunction request. Config size: {}, Function size: {}", - req.config_bytes.len(), - req.function_bytes.len() - ); - - let config_bytes = if !req.config_bytes.is_empty() { - Some(req.config_bytes) - } else { - None - }; + let config_bytes = (!req.config_bytes.is_empty()).then_some(req.config_bytes); let stmt = CreateFunction::from_bytes(req.function_bytes, config_bytes); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreateFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Created).await; info!( - "Total CreateFunction request cost: {}ms", - start_time.elapsed().as_millis() + "create_function completed in {}ms", + timer.elapsed().as_millis() ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + Ok(TonicResponse::new(response)) } async fn create_python_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received CreatePythonFunction request. 
Class name: {}, Modules: {}", - req.class_name, - req.modules.len() - ); - // Convert proto modules to PythonModule - let modules: Vec = req + if req.modules.is_empty() { + return Ok(TonicResponse::new(Self::build_error_response( + StatusCode::BadRequest, + "Python function creation requires at least one module".to_string(), + ))); + } + + let modules: Vec = req .modules .into_iter() - .map(|m| crate::coordinator::PythonModule { + .map(|m| PythonModule { name: m.module_name, bytes: m.module_bytes, }) .collect(); - if modules.is_empty() { - return Ok(TonicResponse::new(Self::build_response( - StatusCode::BadRequest, - "At least one module is required".to_string(), - None, - ))); - } - let stmt = CreatePythonFunction::new(req.class_name, modules, req.config_content); - - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Created - } else { - error!("CreatePythonFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Created).await; info!( - "Total CreatePythonFunction request cost: {}ms", - start_time.elapsed().as_millis() + "create_python_function completed in {}ms", + timer.elapsed().as_millis() ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - result - .data - .as_ref() - .and_then(|ds| Self::data_set_to_ipc_bytes(ds.as_ref())), - ))) + Ok(TonicResponse::new(response)) } async fn drop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received DropFunction request: function_name={}", - req.function_name - ); let stmt = DropFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as 
&dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("DropFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Ok).await; info!( - "Total DropFunction request cost: {}ms", - start_time.elapsed().as_millis() + "drop_function completed in {}ms", + timer.elapsed().as_millis() ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + Ok(TonicResponse::new(response)) } async fn show_functions( &self, - request: Request, + _request: Request, ) -> Result, Status> { - let start_time = Instant::now(); - let _req = request.into_inner(); - info!("Received ShowFunctions request"); - + let timer = Instant::now(); let stmt = ShowFunctions::new(); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - let (status_code, message) = if result.success { - (StatusCode::Ok as i32, result.message) - } else { - error!("ShowFunctions failed: {}", result.message); - (StatusCode::InternalServerError as i32, result.message) - }; + let result = self.coordinator.execute_with_stream_catalog(&stmt).await; + + if !result.success { + error!("show_functions execution failed: {}", result.message); + return Ok(TonicResponse::new(ShowFunctionsResponse { + status_code: StatusCode::InternalServerError as i32, + message: result.message, + functions: vec![], + })); + } - let functions: Vec = result + let functions = result .data .as_ref() .and_then(|arc_ds| { @@ -303,14 +261,12 @@ impl FunctionStreamService for FunctionStreamServiceImpl { .unwrap_or_default(); info!( - "Total ShowFunctions request cost: {}ms, count={}", - start_time.elapsed().as_millis(), - functions.len() + "show_functions completed in {}ms", 
+ timer.elapsed().as_millis() ); - Ok(TonicResponse::new(ShowFunctionsResponse { - status_code, - message, + status_code: StatusCode::Ok as i32, + message: result.message, functions, })) } @@ -319,76 +275,34 @@ impl FunctionStreamService for FunctionStreamServiceImpl { &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StartFunction request: function_name={}", - req.function_name - ); let stmt = StartFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StartFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Ok).await; info!( - "Total StartFunction request cost: {}ms", - start_time.elapsed().as_millis() + "start_function completed in {}ms", + timer.elapsed().as_millis() ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + Ok(TonicResponse::new(response)) } async fn stop_function( &self, request: Request, ) -> Result, Status> { - let start_time = Instant::now(); + let timer = Instant::now(); let req = request.into_inner(); - info!( - "Received StopFunction request: function_name={}", - req.function_name - ); let stmt = StopFunction::new(req.function_name); - let exec_start = Instant::now(); - let result = self.coordinator.execute(&stmt as &dyn Statement); - info!( - "Coordinator execution finished in {}ms", - exec_start.elapsed().as_millis() - ); - - let status_code = if result.success { - StatusCode::Ok - } else { - error!("StopFunction failed: {}", result.message); - StatusCode::InternalServerError - }; + let response = self.execute_statement(&stmt, StatusCode::Ok).await; 
info!( - "Total StopFunction request cost: {}ms", - start_time.elapsed().as_millis() + "stop_function completed in {}ms", + timer.elapsed().as_millis() ); - - Ok(TonicResponse::new(Self::build_response( - status_code, - result.message, - None, - ))) + Ok(TonicResponse::new(response)) } } diff --git a/src/server/initializer.rs b/src/server/initializer.rs index ccb02788..785321b8 100644 --- a/src/server/initializer.rs +++ b/src/server/initializer.rs @@ -10,15 +10,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::config::GlobalConfig; +use std::time::Instant; + use anyhow::{Context, Result}; +use tracing::{debug, info, warn}; + +use crate::config::GlobalConfig; -type InitializerFn = fn(&GlobalConfig) -> Result<()>; +pub type InitializerFn = fn(&GlobalConfig) -> Result<()>; #[derive(Clone)] -struct Component { - name: &'static str, - initializer: InitializerFn, +pub struct Component { + pub name: &'static str, + pub initializer: InitializerFn, +} + +pub struct ComponentRegistry { + components: Vec, } #[derive(Default)] @@ -27,25 +35,17 @@ pub struct ComponentRegistryBuilder { } impl ComponentRegistryBuilder { - #[inline] pub fn new() -> Self { - Self::with_capacity(8) - } - - #[inline] - pub fn with_capacity(capacity: usize) -> Self { Self { - components: Vec::with_capacity(capacity), + components: Vec::with_capacity(8), } } - #[inline] pub fn register(mut self, name: &'static str, initializer: InitializerFn) -> Self { self.components.push(Component { name, initializer }); self } - #[inline] pub fn build(self) -> ComponentRegistry { ComponentRegistry { components: self.components, @@ -53,57 +53,74 @@ impl ComponentRegistryBuilder { } } -pub struct ComponentRegistry { - components: Vec, -} - impl ComponentRegistry { pub fn initialize_all(&self, config: &GlobalConfig) -> Result<()> { if self.components.is_empty() { - log::warn!("No components registered for initialization"); + warn!("Component 
registry is empty; no components to initialize"); return Ok(()); } - log::info!("Initializing {} components...", self.components.len()); + let total = self.components.len(); + info!( + total_components = total, + "Commencing system initialization sequence" + ); - for (idx, component) in self.components.iter().enumerate() { - let start = std::time::Instant::now(); - log::debug!( - "[{}/{}] Initializing component: {}", - idx + 1, - self.components.len(), - component.name + for (index, component) in self.components.iter().enumerate() { + let start_time = Instant::now(); + + debug!( + component = component.name, + step = format!("{}/{}", index + 1, total), + "Initializing component" ); - (component.initializer)(config) - .with_context(|| format!("Component '{}' initialization failed", component.name))?; + (component.initializer)(config).with_context(|| { + format!("Fatal error initializing component: {}", component.name) + })?; - let elapsed = start.elapsed(); - log::debug!( - "[{}/{}] Component '{}' initialized successfully in {:?}", - idx + 1, - self.components.len(), - component.name, - elapsed + debug!( + component = component.name, + elapsed_ms = start_time.elapsed().as_millis(), + "Component initialized successfully" ); } - log::info!( - "All {} components initialized successfully", - self.components.len() - ); + info!("System initialization sequence completed successfully"); Ok(()) } +} - #[inline] - pub fn len(&self) -> usize { - self.components.len() - } +pub fn build_core_registry() -> ComponentRegistry { + let builder = { + let b = ComponentRegistryBuilder::new() + .register("WasmCache", initialize_wasm_cache) + .register("TaskManager", initialize_task_manager) + .register("JobManager", initialize_job_manager); + #[cfg(feature = "python")] + let b = b.register("PythonService", initialize_python_service); + b + }; - #[inline] - pub fn is_empty(&self) -> bool { - self.components.is_empty() - } + builder + .register( + "StreamCatalog", + 
crate::storage::stream_catalog::initialize_stream_catalog, + ) + .register("Coordinator", initialize_coordinator) + .build() +} + +pub fn bootstrap_system(config: &GlobalConfig) -> Result<()> { + let registry = build_core_registry(); + + registry.initialize_all(config)?; + + crate::storage::stream_catalog::restore_global_catalog_from_store(); + crate::storage::stream_catalog::restore_streaming_jobs_from_store(); + + info!("System bootstrap finished. Node is ready to accept traffic."); + Ok(()) } fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { @@ -114,18 +131,20 @@ fn initialize_wasm_cache(config: &GlobalConfig) -> Result<()> { max_size: config.wasm.max_cache_size, }, ); - log::info!( - "WASM cache configuration: enabled={}, dir={}, max_size={} bytes", - config.wasm.enable_cache, - config.wasm.cache_dir, - config.wasm.max_cache_size + + debug!( + enabled = config.wasm.enable_cache, + dir = %config.wasm.cache_dir, + max_size = config.wasm.max_cache_size, + "WASM cache configured" ); + Ok(()) } fn initialize_task_manager(config: &GlobalConfig) -> Result<()> { crate::runtime::taskexecutor::TaskManager::init(config) - .context("TaskManager initialization failed")?; + .context("TaskManager service failed to start")?; Ok(()) } @@ -136,24 +155,33 @@ fn initialize_python_service(config: &GlobalConfig) -> Result<()> { Ok(()) } -fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { - crate::runtime::taskexecutor::TaskManager::get() - .context("Coordinator requires TaskManager to be initialized first")?; - log::info!("Coordinator verified and ready"); +fn initialize_job_manager(config: &GlobalConfig) -> Result<()> { + use crate::runtime::streaming::factory::OperatorFactory; + use crate::runtime::streaming::factory::Registry; + use crate::runtime::streaming::job::JobManager; + use std::sync::Arc; + + let registry = Arc::new(Registry::new()); + let factory = Arc::new(OperatorFactory::new(registry)); + let max_memory_bytes = config + .streaming + 
.max_memory_bytes + .unwrap_or(256 * 1024 * 1024); + + JobManager::init(factory, max_memory_bytes).context("JobManager service failed to start")?; + Ok(()) } -pub fn register_components() -> ComponentRegistry { - let builder = { - let b = ComponentRegistryBuilder::new() - .register("WasmCache", initialize_wasm_cache) - .register("TaskManager", initialize_task_manager); - #[cfg(feature = "python")] - let b = b.register("PythonService", initialize_python_service); - b - }; +fn initialize_coordinator(_config: &GlobalConfig) -> Result<()> { + crate::runtime::taskexecutor::TaskManager::get() + .context("Dependency violation: Coordinator requires TaskManager")?; - builder - .register("Coordinator", initialize_coordinator) - .build() + crate::storage::stream_catalog::CatalogManager::global() + .context("Dependency violation: Coordinator requires StreamCatalog")?; + + crate::runtime::streaming::job::JobManager::global() + .context("Dependency violation: Coordinator requires JobManager")?; + + Ok(()) } diff --git a/src/server/mod.rs b/src/server/mod.rs index 03254af3..cb7a4a85 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -17,5 +17,5 @@ mod initializer; mod service; pub use handler::FunctionStreamServiceImpl; -pub use initializer::register_components; +pub use initializer::bootstrap_system; pub use service::start_server_with_shutdown; diff --git a/src/sql/analysis/aggregate_rewriter.rs b/src/sql/analysis/aggregate_rewriter.rs new file mode 100644 index 00000000..d7be0db8 --- /dev/null +++ b/src/sql/analysis/aggregate_rewriter.rs @@ -0,0 +1,278 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{DFSchema, DataFusionError, Result, not_impl_err, plan_err}; +use datafusion::functions_aggregate::expr_fn::max; +use datafusion::logical_expr::{Aggregate, Expr, Extension, LogicalPlan, Projection}; +use datafusion::prelude::col; +use std::sync::Arc; + +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::logical_node::aggregate::StreamWindowAggregateNode; +use crate::sql::logical_node::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::types::{ + QualifiedField, TIMESTAMP_FIELD, WindowBehavior, WindowType, build_df_schema_with_metadata, + extract_qualified_fields, extract_window_type, +}; + +/// AggregateRewriter transforms batch DataFusion aggregates into streaming stateful operators. +/// It handles windowing (Tumble/Hop/Session), watermarks, and continuous updating aggregates. +pub(crate) struct AggregateRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for AggregateRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Aggregate(mut agg) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Identify windowing functions (e.g., tumble, hop) in GROUP BY. 
+ let mut window_exprs: Vec<_> = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + extract_window_type(e) + .map(|opt| opt.map(|w| (i, w))) + .transpose() + }) + .collect::>>()?; + + if window_exprs.len() > 1 { + return not_impl_err!("Streaming aggregates support at most one window expression"); + } + + // 2. Prepare internal metadata for Key-based distribution. + let mut key_fields: Vec = extract_qualified_fields(&agg.schema) + .iter() + .take(agg.group_expr.len()) + .map(|f| { + QualifiedField::new( + f.qualifier().cloned(), + format!("_key_{}", f.name()), + f.data_type().clone(), + f.is_nullable(), + ) + }) + .collect(); + + // 3. Dispatch to Updating Aggregate if no windowing is detected. + let input_window = StreamingWindowAnalzer::get_window(&agg.input)?; + if window_exprs.is_empty() && input_window.is_none() { + return self.rewrite_as_updating_aggregate( + agg.input, + key_fields, + agg.group_expr, + agg.aggr_expr, + agg.schema, + ); + } + + // 4. Resolve Windowing Strategy (InData vs FromOperator). + let behavior = self.resolve_window_context( + &agg.input, + &mut agg.group_expr, + &agg.schema, + &mut window_exprs, + )?; + + // Adjust keys if windowing is handled by the operator. + if let WindowBehavior::FromOperator { window_index, .. } = &behavior { + key_fields.remove(*window_index); + } + + let key_count = key_fields.len(); + let keyed_input = + self.build_keyed_input(agg.input.clone(), &agg.group_expr, &key_fields)?; + + // 5. Build the final StreamWindowAggregateNode for the physical planner. + let mut internal_fields = extract_qualified_fields(&agg.schema); + if let WindowBehavior::FromOperator { window_index, .. 
} = &behavior { + internal_fields.remove(*window_index); + } + let internal_schema = Arc::new(build_df_schema_with_metadata( + &internal_fields, + agg.schema.metadata().clone(), + )?); + + let rewritten_agg = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + agg.group_expr, + agg.aggr_expr, + internal_schema, + )?; + + let extension = StreamWindowAggregateNode::try_new( + behavior, + LogicalPlan::Aggregate(rewritten_agg), + (0..key_count).collect(), + )?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} + +impl<'a> AggregateRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Internal] Builds the physical Key Calculation layer required for distributed Shuffling. + /// This wraps the input in a Projection and a KeyExtractionNode. + fn build_keyed_input( + &self, + input: Arc, + group_expr: &[Expr], + key_fields: &[QualifiedField], + ) -> Result { + let key_count = group_expr.len(); + let mut projection_fields = key_fields.to_vec(); + projection_fields.extend(extract_qualified_fields(input.schema())); + + let key_schema = Arc::new(build_df_schema_with_metadata( + &projection_fields, + input.schema().metadata().clone(), + )?); + + // Map group expressions to '_key_' aliases while passing through all original columns. 
+ let mut exprs: Vec<_> = group_expr + .iter() + .zip(key_fields.iter()) + .map(|(expr, f)| expr.clone().alias(f.name().to_string())) + .collect(); + + exprs.extend( + extract_qualified_fields(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, key_schema)?); + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } + + /// [Strategy] Rewrites standard GROUP BY into a non-windowed updating aggregate. + /// Injected max(_timestamp) ensures the streaming pulse (Watermark) continues to propagate. + fn rewrite_as_updating_aggregate( + &self, + input: Arc, + key_fields: Vec, + group_expr: Vec, + mut aggr_expr: Vec, + schema: Arc, + ) -> Result> { + let keyed_input = self.build_keyed_input(input, &group_expr, &key_fields)?; + + // Ensure the updating stream maintains time awareness. + let timestamp_col = keyed_input + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan( + "Required _timestamp field missing for updating aggregate".to_string(), + ) + })?; + + let timestamp_field: QualifiedField = timestamp_col.into(); + aggr_expr.push(max(col(timestamp_field.qualified_column())).alias(TIMESTAMP_FIELD)); + + let mut output_fields = extract_qualified_fields(&schema); + output_fields.push(timestamp_field); + + let output_schema = Arc::new(build_df_schema_with_metadata( + &output_fields, + schema.metadata().clone(), + )?); + + let aggregate = Aggregate::try_new_with_schema( + Arc::new(keyed_input), + group_expr, + aggr_expr, + output_schema, + )?; + + Ok(Transformed::yes(LogicalPlan::Aggregate(aggregate))) + } + + /// [Strategy] Reconciles window definitions between the input stream and the current GROUP BY. 
+ fn resolve_window_context( + &self, + input: &LogicalPlan, + group_expr: &mut Vec, + schema: &DFSchema, + window_expr_info: &mut Vec<(usize, WindowType)>, + ) -> Result { + let mut visitor = StreamingWindowAnalzer::default(); + input.visit_with_subqueries(&mut visitor)?; + + let input_window = visitor.window; + let has_group_window = !window_expr_info.is_empty(); + + match (input_window, has_group_window) { + // Re-aggregation or subquery with an existing window. + (Some(i_win), true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + if i_win != g_win { + return plan_err!( + "Inconsistent windowing: input is {:?}, but group by is {:?}", + i_win, + g_win + ); + } + + if let Some(field) = visitor.fields.iter().next() { + group_expr[idx] = Expr::Column(field.qualified_column()); + Ok(WindowBehavior::InData) + } else { + if matches!(i_win, WindowType::Session { .. }) { + return plan_err!("Nested session windows are not supported"); + } + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: i_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: true, + }) + } + } + // First-time windowing defined in this aggregate. + (None, true) => { + let (idx, g_win) = window_expr_info.pop().unwrap(); + group_expr.remove(idx); + Ok(WindowBehavior::FromOperator { + window: g_win, + window_field: schema.qualified_field(idx).into(), + window_index: idx, + is_nested: false, + }) + } + // Passthrough: input is already windowed, no new window in group by. 
+ (Some(_), false) => Ok(WindowBehavior::InData), + _ => unreachable!("Dispatched to non-windowed path previously"), + } + } +} diff --git a/src/sql/analysis/async_udf_rewriter.rs b/src/sql/analysis/async_udf_rewriter.rs new file mode 100644 index 00000000..d6d9b54b --- /dev/null +++ b/src/sql/analysis/async_udf_rewriter.rs @@ -0,0 +1,133 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::constants::sql_field; +use crate::sql::logical_node::AsyncFunctionExecutionNode; +use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; +use crate::sql::schema::StreamSchemaProvider; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan}; +use std::sync::Arc; +use std::time::Duration; + +type AsyncSplitResult = (String, AsyncOptions, Vec); + +#[derive(Debug, Clone, Copy)] +pub struct AsyncOptions { + pub ordered: bool, + pub max_concurrency: usize, + pub timeout: Duration, +} + +pub struct AsyncUdfRewriter<'a> { + provider: &'a StreamSchemaProvider, +} + +impl<'a> AsyncUdfRewriter<'a> { + pub fn new(provider: &'a StreamSchemaProvider) -> Self { + Self { provider } + } + + fn split_async( + expr: Expr, + provider: &StreamSchemaProvider, + ) -> DFResult<(Expr, Option)> { + let mut found: Option<(String, AsyncOptions, 
Vec)> = None; + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e + && let Some(opts) = provider.get_async_udf_options(udf.name()) + { + if found + .replace((udf.name().to_string(), opts, args.clone())) + .is_some() + { + return plan_err!( + "multiple async calls in the same expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + sql_field::ASYNC_RESULT, + )))); + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, found)) + } +} + +impl TreeNodeRewriter for AsyncUdfRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(mut projection) = node else { + for e in node.expressions() { + if let (_, Some((udf, _, _))) = Self::split_async(e.clone(), self.provider)? { + return plan_err!( + "async UDFs are only supported in projections, but {udf} was called in another context" + ); + } + } + return Ok(Transformed::no(node)); + }; + + let mut args = None; + for e in projection.expr.iter_mut() { + let (new_e, Some(udf)) = Self::split_async(e.clone(), self.provider)? 
else { + continue; + }; + if let Some((prev, _, _)) = args.replace(udf) { + return plan_err!( + "Projection contains multiple async UDFs, which is not supported \ + \n(hint: two async UDF calls, {} and {}, appear in the same SELECT statement)", + prev, + args.unwrap().0 + ); + } + *e = new_e; + } + + let Some((name, opts, arg_exprs)) = args else { + return Ok(Transformed::no(LogicalPlan::Projection(projection))); + }; + let udf = self.provider.dylib_udfs.get(&name).unwrap().clone(); + + let input = if matches!(*projection.input, LogicalPlan::Projection(..)) { + Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: (*projection.input).clone(), + table_identifier: TableReference::bare("subquery_projection"), + resolved_schema: projection.input.schema().clone(), + requires_materialization: false, + }), + })) + } else { + projection.input + }; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(AsyncFunctionExecutionNode { + upstream_plan: input, + operator_name: name, + function_config: udf, + invocation_args: arg_exprs, + result_projections: projection.expr, + preserve_ordering: opts.ordered, + concurrency_limit: opts.max_concurrency, + execution_timeout: opts.timeout, + resolved_schema: projection.schema, + }), + }))) + } +} diff --git a/src/sql/analysis/join_rewriter.rs b/src/sql/analysis/join_rewriter.rs new file mode 100644 index 00000000..8a9e5280 --- /dev/null +++ b/src/sql/analysis/join_rewriter.rs @@ -0,0 +1,234 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::analysis::streaming_window_analzer::StreamingWindowAnalzer; +use crate::sql::common::TIMESTAMP_FIELD; +use crate::sql::common::constants::mem_exec_join_side; +use crate::sql::logical_node::join::StreamingJoinNode; +use crate::sql::logical_node::key_calculation::KeyExtractionNode; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::types::{WindowType, build_df_schema_with_metadata, extract_qualified_fields}; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{ + JoinConstraint, JoinType, Result, ScalarValue, TableReference, not_impl_err, plan_err, +}; +use datafusion::logical_expr::{ + self, BinaryExpr, Case, Expr, Extension, Join, LogicalPlan, Projection, build_join_schema, +}; +use datafusion::prelude::coalesce; +use std::sync::Arc; + +/// JoinRewriter handles the transformation of standard SQL joins into streaming-capable joins. +/// It manages stateful "Updating Joins" and time-aligned "Instant Joins". +pub(crate) struct JoinRewriter<'a> { + pub schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> JoinRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// [Validation] Ensures left and right streams have compatible windowing strategies. + fn validate_join_windows(&self, join: &Join) -> Result { + let left_win = StreamingWindowAnalzer::get_window(&join.left)?; + let right_win = StreamingWindowAnalzer::get_window(&join.right)?; + + match (left_win, right_win) { + (None, None) => { + if join.join_type == JoinType::Inner { + Ok(false) // Standard Updating Join (Inner) + } else { + plan_err!( + "Non-inner joins (e.g., LEFT/RIGHT) require windowing to bound state." + ) + } + } + (Some(l), Some(r)) => { + if l != r { + return plan_err!( + "Join window mismatch: left={:?}, right={:?}. 
Windows must match exactly.", + l, + r + ); + } + if let WindowType::Session { .. } = l { + return plan_err!( + "Session windows are currently not supported in streaming joins." + ); + } + Ok(true) // Instant Windowed Join + } + _ => plan_err!( + "Mixed windowing detected. Both sides of a join must be either windowed or non-windowed." + ), + } + } + + /// [Internal] Wraps a join input in a key-extraction layer to facilitate shuffle / key-by distribution. + fn build_keyed_side( + &self, + input: Arc, + keys: Vec, + side: &str, + ) -> Result { + let key_count = keys.len(); + + let projection_exprs = keys + .into_iter() + .enumerate() + .map(|(i, e)| { + e.alias_qualified(Some(TableReference::bare("_stream")), format!("_key_{i}")) + }) + .chain( + extract_qualified_fields(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ) + .collect(); + + let projection = Projection::try_new(projection_exprs, input)?; + let key_ext = KeyExtractionNode::try_new_with_projection( + LogicalPlan::Projection(projection), + (0..key_count).collect(), + side.to_string(), + )?; + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(key_ext), + })) + } + + /// [Strategy] Resolves the output timestamp of the join. + /// Streaming joins must output the 'max' of the two input timestamps to ensure Watermark progression. + fn apply_timestamp_resolution(&self, join_plan: LogicalPlan) -> Result { + let schema = join_plan.schema(); + let all_fields = extract_qualified_fields(schema); + + let timestamp_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() == "_timestamp") + .cloned() + .collect(); + + if timestamp_fields.len() != 2 { + return plan_err!( + "Streaming join requires exactly two input timestamp fields to resolve output time." 
+ ); + } + + // Project all fields except the two raw timestamps + let mut exprs: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + // Calculate: GREATEST(left._timestamp, right._timestamp) + let left_ts = Expr::Column(timestamp_fields[0].qualified_column()); + let right_ts = Expr::Column(timestamp_fields[1].qualified_column()); + + let max_ts_expr = Expr::Case(Case { + expr: Some(Box::new(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left_ts.clone()), + op: logical_expr::Operator::GtEq, + right: Box::new(right_ts.clone()), + }))), + when_then_expr: vec![ + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(true)), None)), + Box::new(left_ts.clone()), + ), + ( + Box::new(Expr::Literal(ScalarValue::Boolean(Some(false)), None)), + Box::new(right_ts.clone()), + ), + ], + else_expr: Some(Box::new(coalesce(vec![left_ts, right_ts]))), + }) + .alias(TIMESTAMP_FIELD); + + exprs.push(max_ts_expr); + + let out_fields: Vec<_> = all_fields + .iter() + .filter(|f| f.name() != "_timestamp") + .cloned() + .chain(std::iter::once(timestamp_fields[0].clone())) + .collect(); + + let out_schema = Arc::new(build_df_schema_with_metadata( + &out_fields, + schema.metadata().clone(), + )?); + + Ok(LogicalPlan::Projection(Projection::try_new_with_schema( + exprs, + Arc::new(join_plan), + out_schema, + )?)) + } +} + +impl TreeNodeRewriter for JoinRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + let LogicalPlan::Join(join) = node else { + return Ok(Transformed::no(node)); + }; + + // 1. Validate Streaming Context + let is_instant = self.validate_join_windows(&join)?; + if join.join_constraint != JoinConstraint::On { + return not_impl_err!("Only 'ON' join constraints are supported in streaming SQL."); + } + if join.on.is_empty() && !is_instant { + return plan_err!("Updating joins require at least one equality condition (Equijoin)."); + } + + // 2. 
Prepare Keyed Inputs for Shuffle + let (left_on, right_on): (Vec<_>, Vec<_>) = join.on.clone().into_iter().unzip(); + let keyed_left = self.build_keyed_side(join.left, left_on, mem_exec_join_side::LEFT)?; + let keyed_right = self.build_keyed_side(join.right, right_on, mem_exec_join_side::RIGHT)?; + + // 3. Assemble Rewritten Join Node + let join_schema = Arc::new(build_join_schema( + keyed_left.schema(), + keyed_right.schema(), + &join.join_type, + )?); + let rewritten_join = LogicalPlan::Join(Join { + left: Arc::new(keyed_left), + right: Arc::new(keyed_right), + on: join.on, + filter: join.filter, + join_type: join.join_type, + join_constraint: JoinConstraint::On, + schema: join_schema, + null_equals_null: false, + }); + + // 4. Resolve Output Watermark (Timestamp Projection) + let plan_with_timestamp = self.apply_timestamp_resolution(rewritten_join)?; + + // 5. Wrap in StreamingJoinNode for physical planning + let state_retention_ttl = + (!is_instant).then_some(self.schema_provider.planning_options.ttl); + let extension = + StreamingJoinNode::new(plan_with_timestamp, is_instant, state_retention_ttl); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(extension), + }))) + } +} diff --git a/src/sql/analysis/mod.rs b/src/sql/analysis/mod.rs new file mode 100644 index 00000000..019d8bf1 --- /dev/null +++ b/src/sql/analysis/mod.rs @@ -0,0 +1,214 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#![allow(clippy::new_without_default)] + +pub(crate) mod aggregate_rewriter; +pub(crate) mod join_rewriter; +pub(crate) mod row_time_rewriter; +pub(crate) mod stream_rewriter; +pub(crate) mod streaming_window_analzer; +pub(crate) mod window_function_rewriter; + +pub mod async_udf_rewriter; +pub mod sink_input_rewriter; +pub mod source_metadata_visitor; +pub mod source_rewriter; +pub mod time_window; +pub mod unnest_rewriter; + +pub use async_udf_rewriter::AsyncOptions; +pub use sink_input_rewriter::SinkInputRewriter; +pub use time_window::{TimeWindowNullCheckRemover, TimeWindowUdfChecker}; +pub use unnest_rewriter::UNNESTED_COL; + +pub use crate::sql::schema::schema_provider::StreamSchemaProvider; + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use tracing::{debug, info, instrument}; + +use crate::sql::logical_node::StreamingOperatorBlueprint; +use crate::sql::logical_node::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::logical_node::projection::StreamProjectionNode; +use crate::sql::logical_node::sink::StreamEgressNode; +use crate::sql::logical_planner::planner::NamedNode; + +fn duration_from_sql_expr( + expr: &datafusion::sql::sqlparser::ast::Expr, +) -> Result { + use datafusion::sql::sqlparser::ast::Expr as SqlExpr; + use datafusion::sql::sqlparser::ast::Value as SqlValue; + use datafusion::sql::sqlparser::ast::ValueWithSpan; + + match expr { + SqlExpr::Interval(interval) => { + let value_str = match interval.value.as_ref() { + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. 
+ }) => s.clone(), + other => return plan_err!("expected interval string literal, found {other}"), + }; + + parse_interval_to_duration(&value_str) + } + SqlExpr::Value(ValueWithSpan { + value: SqlValue::SingleQuotedString(s), + .. + }) => parse_interval_to_duration(s), + other => plan_err!("expected an interval expression, found {other}"), + } +} + +fn parse_interval_to_duration(s: &str) -> Result { + let parts: Vec<&str> = s.split_whitespace().collect(); + if parts.len() != 2 { + return plan_err!("invalid interval string '{s}'; expected ' '"); + } + let value: u64 = parts[0] + .parse() + .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?; + match parts[1].to_lowercase().as_str() { + "second" | "seconds" | "s" => Ok(std::time::Duration::from_secs(value)), + "minute" | "minutes" | "min" => Ok(std::time::Duration::from_secs(value * 60)), + "hour" | "hours" | "h" => Ok(std::time::Duration::from_secs(value * 3600)), + "day" | "days" | "d" => Ok(std::time::Duration::from_secs(value * 86400)), + unit => plan_err!("unsupported interval unit '{unit}'"), + } +} + +fn build_sink_inputs(extensions: &[LogicalPlan]) -> HashMap> { + let mut sink_inputs = HashMap::>::new(); + for extension in extensions.iter() { + if let LogicalPlan::Extension(ext) = extension + && let Some(sink_node) = ext.node.as_any().downcast_ref::() + && let Some(named_node) = sink_node.operator_identity() + { + let inputs = sink_node + .inputs() + .into_iter() + .cloned() + .collect::>(); + sink_inputs.entry(named_node).or_default().extend(inputs); + } + } + sink_inputs +} + +pub(crate) fn maybe_add_key_extension_to_sink(plan: LogicalPlan) -> Result { + let LogicalPlan::Extension(ref ext) = plan else { + return Ok(plan); + }; + + let Some(sink) = ext.node.as_any().downcast_ref::() else { + return Ok(plan); + }; + + let Some(partition_exprs) = sink.destination_table.partition_exprs() else { + return Ok(plan); + }; + + if partition_exprs.is_empty() { + return Ok(plan); + } 
+ + let inputs = plan + .inputs() + .into_iter() + .map(|input| { + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode { + operator_label: Some("key-calc-partition".to_string()), + resolved_schema: input.schema().clone(), + upstream_plan: input.clone(), + extraction_strategy: KeyExtractionStrategy::CalculatedExpressions( + partition_exprs.clone(), + ), + }), + })) + }) + .collect::>()?; + + use datafusion::prelude::col; + let unkey = LogicalPlan::Extension(Extension { + node: Arc::new( + StreamProjectionNode::try_new( + inputs, + Some("unkey".to_string()), + sink.schema().iter().map(|(_, f)| col(f.name())).collect(), + )? + .with_shuffle_routing(), + ), + }); + + let node = sink.with_exprs_and_inputs(vec![], vec![unkey])?; + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(node), + })) +} + +pub fn rewrite_sinks(extensions: Vec) -> Result> { + let mut sink_inputs = build_sink_inputs(&extensions); + let mut new_extensions = vec![]; + for extension in extensions { + let mut rewriter = SinkInputRewriter::new(&mut sink_inputs); + let result = extension.rewrite(&mut rewriter)?; + if !rewriter.was_removed { + new_extensions.push(result.data); + } + } + + new_extensions + .into_iter() + .map(maybe_add_key_extension_to_sink) + .collect() +} + +/// Entry point for transforming a standard DataFusion LogicalPlan into a +/// Streaming-aware LogicalPlan. +/// +/// This function coordinates multiple rewriting passes and ensures the +/// resulting plan satisfies streaming constraints. +#[instrument(skip_all, level = "debug")] +pub fn rewrite_plan( + plan: LogicalPlan, + schema_provider: &StreamSchemaProvider, +) -> Result { + info!("Starting streaming plan rewrite pipeline"); + + let Transformed { data: plan, .. } = + plan.rewrite_with_subqueries(&mut source_rewriter::SourceRewriter::new(schema_provider))?; + + let mut rewriter = stream_rewriter::StreamRewriter::new(schema_provider); + let Transformed { + data: rewritten_plan, + .. 
+ } = plan.rewrite_with_subqueries(&mut rewriter)?; + + rewritten_plan.visit_with_subqueries(&mut TimeWindowUdfChecker {})?; + + if cfg!(debug_assertions) { + debug!( + "Streaming logical plan graphviz:\n{}", + rewritten_plan.display_graphviz() + ); + } + + info!("Streaming plan rewrite completed successfully"); + Ok(rewritten_plan) +} diff --git a/src/sql/analysis/row_time_rewriter.rs b/src/sql/analysis/row_time_rewriter.rs new file mode 100644 index 00000000..13e2a048 --- /dev/null +++ b/src/sql/analysis/row_time_rewriter.rs @@ -0,0 +1,49 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult}; +use datafusion::logical_expr::Expr; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Replaces the virtual `row_time()` scalar function with a physical reference to `_timestamp`. +/// +/// This is a critical mapping step that allows users to use a friendly SQL function +/// while the engine operates on the mandatory internal streaming timestamp. +pub struct RowTimeRewriter; + +impl TreeNodeRewriter for RowTimeRewriter { + type Node = Expr; + + fn f_down(&mut self, node: Self::Node) -> DFResult> { + // Use pattern matching to identify the `row_time` scalar function. 
+ if let Expr::ScalarFunction(func) = &node + && func.name() == planning_placeholder_udf::ROW_TIME + { + // Map the virtual function to the physical internal timestamp column. + // We use .alias() to preserve the original name "row_time()" in the output schema, + // ensuring that user-facing column names do not change unexpectedly. + let physical_col = Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + }) + .alias("row_time()"); + + return Ok(Transformed::yes(physical_col)); + } + + Ok(Transformed::no(node)) + } +} diff --git a/src/sql/analysis/sink_input_rewriter.rs b/src/sql/analysis/sink_input_rewriter.rs new file mode 100644 index 00000000..201439cc --- /dev/null +++ b/src/sql/analysis/sink_input_rewriter.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::logical_node::StreamingOperatorBlueprint; +use crate::sql::logical_node::sink::StreamEgressNode; +use crate::sql::logical_planner::planner::NamedNode; +use datafusion::common::Result as DFResult; +use datafusion::common::tree_node::{Transformed, TreeNodeRecursion, TreeNodeRewriter}; +use datafusion::logical_expr::{Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use std::collections::HashMap; +use std::sync::Arc; + +type SinkInputs = HashMap>; + +/// Merges inputs for sinks with the same name to avoid duplicate sinks in the plan. 
+pub struct SinkInputRewriter<'a> {
+    sink_inputs: &'a mut SinkInputs,
+    pub was_removed: bool,
+}
+
+impl<'a> SinkInputRewriter<'a> {
+    pub(crate) fn new(sink_inputs: &'a mut SinkInputs) -> Self {
+        Self {
+            sink_inputs,
+            was_removed: false,
+        }
+    }
+}
+
+impl TreeNodeRewriter for SinkInputRewriter<'_> {
+    type Node = LogicalPlan;
+
+    fn f_down(&mut self, node: Self::Node) -> DFResult<Transformed<Self::Node>> {
+        if let LogicalPlan::Extension(extension) = &node
+            && let Some(sink_node) = extension.node.as_any().downcast_ref::<StreamEgressNode>()
+            && let Some(named_node) = sink_node.operator_identity()
+        {
+            if let Some(inputs) = self.sink_inputs.remove(&named_node) {
+                let new_node = LogicalPlan::Extension(Extension {
+                    node: Arc::new(sink_node.with_exprs_and_inputs(vec![], inputs)?),
+                });
+                return Ok(Transformed::new(new_node, true, TreeNodeRecursion::Jump));
+            }
+            self.was_removed = true;
+        }
+        Ok(Transformed::no(node))
+    }
+}
diff --git a/src/sql/analysis/source_metadata_visitor.rs b/src/sql/analysis/source_metadata_visitor.rs
new file mode 100644
index 00000000..55350301
--- /dev/null
+++ b/src/sql/analysis/source_metadata_visitor.rs
@@ -0,0 +1,69 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::sql::logical_node::sink::{STREAM_EGRESS_NODE_NAME, StreamEgressNode};
+use crate::sql::logical_node::table_source::{STREAM_INGESTION_NODE_NAME, StreamIngestionNode};
+use crate::sql::schema::StreamSchemaProvider;
+use datafusion::common::Result as DFResult;
+use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor};
+use datafusion::logical_expr::{Extension, LogicalPlan};
+use std::collections::HashSet;
+
+/// Collects connection IDs from source and sink nodes in the logical plan. NOTE(review): the id type was lost in transit; reconstructed as `i64` to match `ConnectorTable::registry_id` — confirm against that declaration.
+pub struct SourceMetadataVisitor<'a> {
+    schema_provider: &'a StreamSchemaProvider,
+    pub connection_ids: HashSet<i64>,
+}
+
+impl<'a> SourceMetadataVisitor<'a> {
+    pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self {
+        Self {
+            schema_provider,
+            connection_ids: HashSet::new(),
+        }
+    }
+
+    fn get_connection_id(&self, node: &LogicalPlan) -> Option<i64> {
+        let LogicalPlan::Extension(Extension { node }) = node else {
+            return None;
+        };
+
+        let table_name = match node.name() {
+            name if name == STREAM_INGESTION_NODE_NAME => {
+                let ext = node.as_any().downcast_ref::<StreamIngestionNode>()?;
+                ext.source_identifier.to_string()
+            }
+            name if name == STREAM_EGRESS_NODE_NAME => {
+                let ext = node.as_any().downcast_ref::<StreamEgressNode>()?;
+                ext.target_identifier.to_string()
+            }
+            _ => return None,
+        };
+
+        let table = self.schema_provider.get_catalog_table(&table_name)?;
+        match table {
+            crate::sql::schema::table::Table::ConnectorTable(t) => t.registry_id,
+            _ => None,
+        }
+    }
+}
+
+impl TreeNodeVisitor<'_> for SourceMetadataVisitor<'_> {
+    type Node = LogicalPlan;
+
+    fn f_down(&mut self, node: &Self::Node) -> DFResult<TreeNodeRecursion> {
+        if let Some(id) = self.get_connection_id(node) {
+            self.connection_ids.insert(id);
+        }
+        Ok(TreeNodeRecursion::Continue)
+    }
+}
diff --git a/src/sql/analysis/source_rewriter.rs b/src/sql/analysis/source_rewriter.rs
new file mode 100644
index 00000000..0bd15e85
--- /dev/null
+++ b/src/sql/analysis/source_rewriter.rs
@@ -0,0 +1,299 @@
+// Licensed under the Apache License, Version
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::ScalarValue; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result as DFResult, TableReference, plan_err}; +use datafusion::logical_expr::{ + self, BinaryExpr, Expr, Extension, LogicalPlan, Projection, TableScan, +}; + +use crate::sql::common::UPDATING_META_FIELD; +use crate::sql::logical_node::debezium::UnrollDebeziumPayloadNode; +use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; +use crate::sql::logical_node::table_source::StreamIngestionNode; +use crate::sql::logical_node::watermark_node::EventTimeWatermarkNode; +use crate::sql::schema::ColumnDescriptor; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::table::Table; +use crate::sql::types::TIMESTAMP_FIELD; + +/// Rewrites table scans: projections are lifted out of scans into a dedicated projection node +/// (including virtual fields), using a connector table-source extension instead of a bare +/// `TableScan`, optionally with Debezium unrolling for updating sources, then remote boundary and +/// watermark. 
+pub struct SourceRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl<'a> SourceRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } +} + +impl SourceRewriter<'_> { + fn projection_expr_for_column(col: &ColumnDescriptor, qualifier: &TableReference) -> Expr { + if let Some(logic) = col.computation_logic() { + logic.clone().alias_qualified( + Some(qualifier.clone()), + col.arrow_field().name().to_string(), + ) + } else { + Expr::Column(Column { + relation: Some(qualifier.clone()), + name: col.arrow_field().name().to_string(), + spans: Default::default(), + }) + } + } + + fn watermark_expression(table: &SourceTable) -> DFResult { + match table.temporal_config.watermark_strategy_column.clone() { + Some(watermark_field) => table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == watermark_field.as_str() { + return if let Some(expr) = c.computation_logic() { + Some(expr.clone()) + } else { + Some(Expr::Column(Column { + relation: None, + name: c.arrow_field().name().to_string(), + spans: Default::default(), + })) + }; + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Watermark field {watermark_field} not found")) + }), + None => Ok(Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column(Column { + relation: None, + name: TIMESTAMP_FIELD.to_string(), + spans: Default::default(), + })), + op: logical_expr::Operator::Minus, + right: Box::new(Expr::Literal( + ScalarValue::DurationNanosecond(Some(Duration::from_secs(1).as_nanos() as i64)), + None, + )), + })), + } + } + + fn projection_expressions( + table: &SourceTable, + qualifier: &TableReference, + projection: &Option>, + ) -> DFResult> { + let mut expressions: Vec = table + .schema_specs + .iter() + .map(|col| Self::projection_expr_for_column(col, qualifier)) + .collect(); + + if let Some(proj) = projection { + expressions = proj.iter().map(|i| expressions[*i].clone()).collect(); + } + + 
if let Some(event_time_field) = table.temporal_config.event_column.clone() { + let expr = table + .schema_specs + .iter() + .find_map(|c| { + if c.arrow_field().name() == event_time_field.as_str() { + return Some(Self::projection_expr_for_column(c, qualifier)); + } + None + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("Event time field {event_time_field} not found")) + })?; + + expressions + .push(expr.alias_qualified(Some(qualifier.clone()), TIMESTAMP_FIELD.to_string())); + } else { + let has_ts = table + .schema_specs + .iter() + .any(|c| c.arrow_field().name() == TIMESTAMP_FIELD); + if !has_ts { + return plan_err!( + "Connector table '{}' has no `{}` column; declare WATERMARK FOR AS ... in CREATE TABLE", + table.table_identifier, + TIMESTAMP_FIELD + ); + } + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + TIMESTAMP_FIELD, + ))); + } + + if table.is_updating() { + expressions.push(Expr::Column(Column::new( + Some(qualifier.clone()), + UPDATING_META_FIELD, + ))); + } + + Ok(expressions) + } + + /// Connector path: `StreamIngestionNode` (table source) → optional `UnrollDebeziumPayloadNode` + /// → `Projection`, mirroring Arroyo `TableSourceExtension` + Debezium unroll + projection. 
+ fn projection(&self, table_scan: &TableScan, table: &SourceTable) -> DFResult { + let qualifier = table_scan.table_name.clone(); + + let table_source = LogicalPlan::Extension(Extension { + node: Arc::new(StreamIngestionNode::try_new( + qualifier.clone(), + table.clone(), + )?), + }); + + let (projection_input, scan_projection) = if table.is_updating() { + if table.key_constraints.is_empty() { + return plan_err!( + "Updating connector table `{}` requires at least one PRIMARY KEY for CDC unrolling", + table.table_identifier + ); + } + let unrolled = LogicalPlan::Extension(Extension { + node: Arc::new(UnrollDebeziumPayloadNode::try_new( + table_source, + Arc::new(table.key_constraints.clone()), + )?), + }); + (unrolled, None) + } else { + (table_source, table_scan.projection.clone()) + }; + + Ok(LogicalPlan::Projection(Projection::try_new( + Self::projection_expressions(table, &qualifier, &scan_projection)?, + Arc::new(projection_input), + )?)) + } + + fn mutate_connector_table( + &self, + table_scan: &TableScan, + table: &SourceTable, + ) -> DFResult> { + let input = self.projection(table_scan, table)?; + + let schema = input.schema().clone(); + let remote = LogicalPlan::Extension(Extension { + node: Arc::new(RemoteTableBoundaryNode { + upstream_plan: input, + table_identifier: table_scan.table_name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }), + }); + + let watermark_node = EventTimeWatermarkNode::try_new( + remote, + table_scan.table_name.clone(), + Self::watermark_expression(table)?, + ) + .map_err(|err| { + DataFusionError::Internal(format!("failed to create watermark node: {err}")) + })?; + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(watermark_node), + }))) + } + + fn mutate_table_from_query( + &self, + table_scan: &TableScan, + logical_plan: &LogicalPlan, + ) -> DFResult> { + let column_expressions: Vec<_> = if let Some(projection) = &table_scan.projection { + logical_plan + .schema() + .columns() 
+ .into_iter() + .enumerate() + .filter_map(|(i, col)| { + if projection.contains(&i) { + Some(Expr::Column(col)) + } else { + None + } + }) + .collect() + } else { + logical_plan + .schema() + .columns() + .into_iter() + .map(Expr::Column) + .collect() + }; + + let target_columns: Vec<_> = table_scan.projected_schema.columns().into_iter().collect(); + + let expressions = column_expressions + .into_iter() + .zip(target_columns) + .map(|(expr, col)| expr.alias_qualified(col.relation, col.name)) + .collect(); + + let projection = LogicalPlan::Projection(Projection::try_new_with_schema( + expressions, + Arc::new(logical_plan.clone()), + table_scan.projected_schema.clone(), + )?); + + Ok(Transformed::yes(projection)) + } +} + +impl TreeNodeRewriter for SourceRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::TableScan(table_scan) = node else { + return Ok(Transformed::no(node)); + }; + + let table_name = table_scan.table_name.table(); + let table = self + .schema_provider + .get_catalog_table(table_name) + .ok_or_else(|| DataFusionError::Plan(format!("Table {table_name} not found")))?; + + match table { + Table::ConnectorTable(table) => self.mutate_connector_table(&table_scan, table), + Table::LookupTable(_table) => { + // TODO: implement LookupSource extension + plan_err!("Lookup tables are not yet supported") + } + Table::TableFromQuery { + name: _, + logical_plan, + } => self.mutate_table_from_query(&table_scan, logical_plan), + } + } +} diff --git a/src/sql/analysis/stream_rewriter.rs b/src/sql/analysis/stream_rewriter.rs new file mode 100644 index 00000000..a4393bd1 --- /dev/null +++ b/src/sql/analysis/stream_rewriter.rs @@ -0,0 +1,234 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use super::StreamSchemaProvider; +use crate::sql::analysis::TimeWindowNullCheckRemover; +use crate::sql::analysis::row_time_rewriter::RowTimeRewriter; +use crate::sql::analysis::{ + aggregate_rewriter::AggregateRewriter, join_rewriter::JoinRewriter, + window_function_rewriter::WindowFunctionRewriter, +}; +use crate::sql::logical_node::StreamingOperatorBlueprint; +use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; +use crate::sql::types::{QualifiedField, TIMESTAMP_FIELD}; +use datafusion::common::tree_node::{Transformed, TreeNodeRewriter}; +use datafusion::common::{Column, DataFusionError, Result, Spans, TableReference, plan_err}; +use datafusion::logical_expr::{ + Expr, Extension, Filter, LogicalPlan, Projection, SubqueryAlias, Union, +}; +use datafusion_common::tree_node::TreeNode; +use datafusion_expr::{Aggregate, Join}; + +pub struct StreamRewriter<'a> { + pub(crate) schema_provider: &'a StreamSchemaProvider, +} + +impl TreeNodeRewriter for StreamRewriter<'_> { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> Result> { + match node { + // Logic Delegation + LogicalPlan::Projection(p) => self.rewrite_projection(p), + LogicalPlan::Filter(f) => self.rewrite_filter(f), + LogicalPlan::Union(u) => self.rewrite_union(u), + + // Delegation to specialized sub-rewriters + LogicalPlan::Aggregate(agg) => self.rewrite_aggregate(agg), + LogicalPlan::Join(join) => self.rewrite_join(join), + LogicalPlan::Window(_) => 
self.rewrite_window(node), + LogicalPlan::SubqueryAlias(sa) => self.rewrite_subquery_alias(sa), + + // Explicitly Unsupported Operations + LogicalPlan::Sort(_) => self.unsupported_error("ORDER BY", &node), + LogicalPlan::Limit(_) => self.unsupported_error("LIMIT", &node), + LogicalPlan::Repartition(_) => self.unsupported_error("Repartitions", &node), + LogicalPlan::Explain(_) => self.unsupported_error("EXPLAIN", &node), + LogicalPlan::Analyze(_) => self.unsupported_error("ANALYZE", &node), + + _ => Ok(Transformed::no(node)), + } + } +} + +impl<'a> StreamRewriter<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider) -> Self { + Self { schema_provider } + } + + /// Delegates to AggregateRewriter to transform batch aggregates into streaming stateful operators. + fn rewrite_aggregate(&self, agg: Aggregate) -> Result> { + AggregateRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Aggregate(agg)) + } + + /// Delegates to JoinRewriter to handle streaming join semantics (e.g., TTL, state management). + fn rewrite_join(&self, join: Join) -> Result> { + JoinRewriter { + schema_provider: self.schema_provider, + } + .f_up(LogicalPlan::Join(join)) + } + + /// Delegates to WindowFunctionRewriter for stream-aware windowing logic. + fn rewrite_window(&self, node: LogicalPlan) -> Result> { + WindowFunctionRewriter {}.f_up(node) + } + + /// Refreshes SubqueryAlias metadata to align with potentially rewritten internal schemas. + fn rewrite_subquery_alias(&self, sa: SubqueryAlias) -> Result> { + // Since the inner 'sa.input' has been rewritten (bottom-up), we must re-create + // the alias node to ensure the outer schema correctly reflects internal changes. 
+ let new_sa = SubqueryAlias::try_new(sa.input, sa.alias).map_err(|e| { + DataFusionError::Internal(format!("Failed to re-alias subquery: {}", e)) + })?; + + Ok(Transformed::yes(LogicalPlan::SubqueryAlias(new_sa))) + } + + /// Handles timestamp propagation and row_time() mapping for Projections + fn rewrite_projection(&self, mut projection: Projection) -> Result> { + // Check if the current projection already has a timestamp field; + // if not, we must inject it to maintain streaming heartbeats. + if !has_timestamp_field(&projection.schema) { + let input_schema = projection.input.schema(); + + // Resolve the timestamp field from the input schema using the global constant. + let timestamp_field: QualifiedField = input_schema + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD) + .map_err(|_| { + DataFusionError::Plan(format!( + "No timestamp field found in projection input ({})", + projection.input.display() + )) + })? + .into(); + + // Update the logical schema to include the newly injected timestamp. + projection.schema = add_timestamp_field( + projection.schema.clone(), + timestamp_field.qualifier().cloned(), + ) + .expect("Failed to add timestamp to projection schema"); + + // Physically push the timestamp column into the expression list. + projection.expr.push(Expr::Column(Column { + relation: timestamp_field.qualifier().cloned(), + name: TIMESTAMP_FIELD.to_string(), + spans: Spans::default(), + })); + } + + // Map user-friendly row_time() function calls to internal _timestamp column references. + let rewritten = projection + .expr + .iter() + .map(|expr| expr.clone().rewrite(&mut RowTimeRewriter {})) + .collect::>>()?; + + // If any expressions were modified (e.g., row_time() was replaced), update the projection. + if rewritten.iter().any(|r| r.transformed) { + projection.expr = rewritten.into_iter().map(|r| r.data).collect(); + } + + // Return the updated plan node wrapped in a Transformed container. 
+ Ok(Transformed::yes(LogicalPlan::Projection(projection))) + } + + /// Harmonizes schemas across Union branches and wraps them in RemoteTableBoundaryNodes. + /// + /// This ensures that all inputs to a UNION operation share the exact same schema metadata, + /// preventing "Schema Drift" where different branches have different field qualifiers. + fn rewrite_union(&self, mut union: Union) -> Result> { + // Industrial engines use the first branch as the "Master Schema" for the Union. + // We clone it once to ensure all subsequent branches are forced to comply. + let master_schema = union.inputs[0].schema().clone(); + union.schema = master_schema.clone(); + + for input in union.inputs.iter_mut() { + // Optimization: If the node is already a non-transparent Extension, + // we skip wrapping to avoid unnecessary nesting of logical nodes. + if let LogicalPlan::Extension(Extension { node }) = input.as_ref() { + let stream_ext: &dyn StreamingOperatorBlueprint = node.try_into().map_err(|e| { + DataFusionError::Internal(format!( + "Failed to resolve StreamingOperatorBlueprint: {}", + e + )) + })?; + + if !stream_ext.is_passthrough_boundary() { + continue; + } + } + + // Wrap each branch in a RemoteTableBoundaryNode. + // This acts as a logical "bridge" that forces the input to adopt the master_schema, + // effectively stripping away branch-specific qualifiers (e.g., table aliases). + let remote_ext = Arc::new(RemoteTableBoundaryNode { + upstream_plan: input.as_ref().clone(), + table_identifier: TableReference::bare("union_input"), + resolved_schema: master_schema.clone(), + requires_materialization: false, // Internal logical boundary only; does not require physical sink. + }); + + // Atomically replace the input with the wrapped version. + *input = Arc::new(LogicalPlan::Extension(Extension { node: remote_ext })); + } + + Ok(Transformed::yes(LogicalPlan::Union(union))) + } + + /// Optimizes Filter nodes by stripping redundant NULL checks on time window expressions. 
+ /// + /// In streaming SQL, DataFusion often injects 'IS NOT NULL' guards for window functions + /// that are redundant or can interfere with watermark propagation. This rewriter + /// cleans those predicates to simplify the physical execution plan. + fn rewrite_filter(&self, filter: Filter) -> Result> { + // We attempt to rewrite the predicate using a specialized sub-rewriter. + // The TimeWindowNullCheckRemover specifically targets expressions like + // `tumble(...) IS NOT NULL` and simplifies them to `TRUE`. + let rewritten_expr = filter + .predicate + .clone() + .rewrite(&mut TimeWindowNullCheckRemover {})?; + + if !rewritten_expr.transformed { + return Ok(Transformed::no(LogicalPlan::Filter(filter))); + } + + // Industrial Guard: Re-validate the predicate against the input schema. + // 'Filter::try_new' ensures that the transformed expression is still semantically + // valid for the underlying data stream. + let new_filter = Filter::try_new(rewritten_expr.data, filter.input).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to re-validate filtered predicate after NULL-check removal: {}", + e + )) + })?; + + Ok(Transformed::yes(LogicalPlan::Filter(new_filter))) + } + + /// Centralized error handler for unsupported streaming operations + fn unsupported_error(&self, op: &str, node: &LogicalPlan) -> Result> { + plan_err!( + "{} is not currently supported in streaming SQL ({})", + op, + node.display() + ) + } +} diff --git a/src/sql/analysis/streaming_window_analzer.rs b/src/sql/analysis/streaming_window_analzer.rs new file mode 100644 index 00000000..b8a7f78f --- /dev/null +++ b/src/sql/analysis/streaming_window_analzer.rs @@ -0,0 +1,219 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::common::tree_node::{TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{Column, DFSchema, DataFusionError, Result}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, expr::Alias}; + +use crate::sql::logical_node::aggregate::{STREAM_AGG_EXTENSION_NAME, StreamWindowAggregateNode}; +use crate::sql::logical_node::join::STREAM_JOIN_NODE_TYPE; +use crate::sql::types::{ + QualifiedField, WindowBehavior, WindowType, extract_qualified_fields, extract_window_type, +}; + +/// WindowDetectingVisitor identifies windowing strategies and tracks window-carrying fields +/// as they propagate upward through the logical plan tree. +#[derive(Debug, Default)] +pub(crate) struct StreamingWindowAnalzer { + /// The specific window type discovered (Tumble, Hop, etc.) + pub(crate) window: Option, + /// Set of fields in the current plan node that carry window semantics. + pub(crate) fields: HashSet, +} + +impl StreamingWindowAnalzer { + /// Entry point to resolve the WindowType of a given plan branch. + pub(crate) fn get_window(logical_plan: &LogicalPlan) -> Result> { + let mut visitor = Self::default(); + logical_plan.visit_with_subqueries(&mut visitor)?; + Ok(visitor.window) + } + + /// Resolves whether an expression is a reference to an existing window field + /// or a definition of a new window function. + fn resolve_window_from_expr( + &self, + expr: &Expr, + input_schema: &DFSchema, + ) -> Result> { + // 1. Check if the expression directly references a known window field. 
+ if let Some(col) = extract_column(expr) { + let field = input_schema.field_with_name(col.relation.as_ref(), &col.name)?; + let df_field: QualifiedField = (col.relation.clone(), Arc::new(field.clone())).into(); + + if self.fields.contains(&df_field) { + return Ok(self.window.clone()); + } + } + + // 2. Otherwise, check if it's a new window function call (e.g., tumble(), hop()). + extract_window_type(expr) + } + + /// Updates the internal state with new window findings and maps them to the output schema. + fn update_state( + &mut self, + matched_windows: Vec<(usize, WindowType)>, + schema: &DFSchema, + ) -> Result<()> { + // Clear fields from the previous level to maintain schema strictly for the current node. + self.fields.clear(); + + for (index, window) in matched_windows { + if let Some(existing) = &self.window { + if existing != &window { + return Err(DataFusionError::Plan(format!( + "Conflicting windows in the same operator: expected {:?}, found {:?}", + existing, window + ))); + } + } else { + self.window = Some(window); + } + // Record this specific index in the schema as a window carrier. + self.fields.insert(schema.qualified_field(index).into()); + } + Ok(()) + } +} + +pub(crate) fn extract_column(expr: &Expr) -> Option<&Column> { + match expr { + Expr::Column(column) => Some(column), + Expr::Alias(Alias { expr, .. }) => extract_column(expr), + _ => None, + } +} + +impl TreeNodeVisitor<'_> for StreamingWindowAnalzer { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + // Joins require cross-branch validation to ensure left and right sides align on time. + if let LogicalPlan::Extension(Extension { node }) = node + && node.name() == STREAM_JOIN_NODE_TYPE + { + let mut branch_windows = HashSet::new(); + for input in node.inputs() { + if let Some(w) = Self::get_window(input)? 
{ + branch_windows.insert(w); + } + } + + if branch_windows.len() > 1 { + return Err(DataFusionError::Plan( + "Join inputs have mismatched windowing strategies.".into(), + )); + } + self.window = branch_windows.into_iter().next(); + + // Optimization: No need to recurse manually if we've resolved the join boundary. + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + match node { + LogicalPlan::Projection(p) => { + let windows = p + .expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, p.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &p.schema)?; + } + + LogicalPlan::Aggregate(agg) => { + let windows = agg + .group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + self.resolve_window_from_expr(e, agg.input.schema()) + .transpose() + .map(|res| res.map(|w| (i, w))) + }) + .collect::>>()?; + + self.update_state(windows, &agg.schema)?; + } + + LogicalPlan::SubqueryAlias(sa) => { + // Map fields through the alias layer by resolving column indices. + let input_schema = sa.input.schema(); + let mapped = self + .fields + .drain() + .map(|f| { + let idx = input_schema.index_of_column(&f.qualified_column())?; + Ok(sa.schema.qualified_field(idx).into()) + }) + .collect::>>()?; + + self.fields = mapped; + } + + LogicalPlan::Extension(Extension { node }) + if node.name() == STREAM_AGG_EXTENSION_NAME => + { + let ext = node + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("StreamWindowAggregateNode is malformed".into()) + })?; + + match &ext.window_spec { + WindowBehavior::FromOperator { + window, + window_field, + is_nested, + .. 
+ } => { + if self.window.is_some() && !*is_nested { + return Err(DataFusionError::Plan( + "Redundant window definition on an already windowed stream.".into(), + )); + } + self.window = Some(window.clone()); + self.fields.insert(window_field.clone()); + } + WindowBehavior::InData => { + let current_schema_fields: HashSet<_> = + extract_qualified_fields(node.schema()) + .into_iter() + .collect(); + self.fields.retain(|f| current_schema_fields.contains(f)); + + if self.fields.is_empty() { + return Err(DataFusionError::Plan( + "Windowed aggregate missing window metadata from its input.".into(), + )); + } + } + } + } + _ => {} + } + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/analysis/time_window.rs b/src/sql/analysis/time_window.rs new file mode 100644 index 00000000..104c0cca --- /dev/null +++ b/src/sql/analysis/time_window.rs @@ -0,0 +1,83 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::{ + Transformed, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor, +}; +use datafusion::common::{DataFusionError, Result as DFResult, ScalarValue, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{Expr, LogicalPlan}; + +/// Returns the time window function name if the expression is one (tumble/hop/session). 
+pub fn is_time_window(expr: &Expr) -> Option<&str> {
+    if let Expr::ScalarFunction(ScalarFunction { func, args: _ }) = expr {
+        match func.name() {
+            "tumble" | "hop" | "session" => return Some(func.name()),
+            _ => {}
+        }
+    }
+    None
+}
+
+struct TimeWindowExprChecker {}
+
+impl TreeNodeVisitor<'_> for TimeWindowExprChecker {
+    type Node = Expr;
+
+    fn f_down(&mut self, node: &Self::Node) -> DFResult<TreeNodeRecursion> {
+        if let Some(w) = is_time_window(node) {
+            return plan_err!(
+                "time window function {} is not allowed in this context. \
+                Are you missing a GROUP BY clause?",
+                w
+            );
+        }
+        Ok(TreeNodeRecursion::Continue)
+    }
+}
+
+/// Visitor that checks an entire LogicalPlan for misplaced time window UDFs.
+pub struct TimeWindowUdfChecker {}
+
+impl TreeNodeVisitor<'_> for TimeWindowUdfChecker {
+    type Node = LogicalPlan;
+
+    fn f_down(&mut self, node: &Self::Node) -> DFResult<TreeNodeRecursion> {
+        use datafusion::common::tree_node::TreeNode;
+        node.expressions().iter().try_for_each(|expr| {
+            let mut checker = TimeWindowExprChecker {};
+            expr.visit(&mut checker)?;
+            Ok::<(), DataFusionError>(())
+        })?;
+        Ok(TreeNodeRecursion::Continue)
+    }
+}
+
+/// Removes `IS NOT NULL` checks wrapping time window functions,
+/// replacing them with `true` since time windows are never null.
+pub struct TimeWindowNullCheckRemover {}
+
+impl TreeNodeRewriter for TimeWindowNullCheckRemover {
+    type Node = Expr;
+
+    fn f_down(&mut self, node: Self::Node) -> DFResult<Transformed<Self::Node>> {
+        if let Expr::IsNotNull(expr) = &node
+            && is_time_window(expr).is_some()
+        {
+            return Ok(Transformed::yes(Expr::Literal(
+                ScalarValue::Boolean(Some(true)),
+                None,
+            )));
+        }
+        Ok(Transformed::no(node))
+    }
+}
diff --git a/src/sql/analysis/udafs.rs b/src/sql/analysis/udafs.rs
new file mode 100644
index 00000000..73fc062c
--- /dev/null
+++ b/src/sql/analysis/udafs.rs
@@ -0,0 +1,43 @@
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::array::ArrayRef; +use datafusion::error::Result; +use datafusion::physical_plan::Accumulator; +use datafusion::scalar::ScalarValue; +use std::fmt::Debug; + +/// Fake UDAF used just for plan-time placeholder. +#[derive(Debug)] +pub struct EmptyUdaf {} + +impl Accumulator for EmptyUdaf { + fn update_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } + + fn evaluate(&self) -> Result { + unreachable!() + } + + fn size(&self) -> usize { + unreachable!() + } + + fn state(&self) -> Result> { + unreachable!() + } + + fn merge_batch(&mut self, _: &[ArrayRef]) -> Result<()> { + unreachable!() + } +} diff --git a/src/sql/analysis/unnest_rewriter.rs b/src/sql/analysis/unnest_rewriter.rs new file mode 100644 index 00000000..147b1f49 --- /dev/null +++ b/src/sql/analysis/unnest_rewriter.rs @@ -0,0 +1,179 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion::common::{Column, Result as DFResult, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ColumnUnnestList, Expr, LogicalPlan, Projection, Unnest}; + +use crate::sql::common::constants::planning_placeholder_udf; +use crate::sql::types::{QualifiedField, build_df_schema, extract_qualified_fields}; + +pub const UNNESTED_COL: &str = "__unnested"; + +/// Rewrites projections containing `unnest()` calls into proper Unnest logical plans. +pub struct UnnestRewriter {} + +impl UnnestRewriter { + fn split_unnest(expr: Expr) -> DFResult<(Expr, Option)> { + let mut captured: Option = None; + + let expr = expr.transform_up(|e| { + if let Expr::ScalarFunction(ScalarFunction { func: udf, args }) = &e + && udf.name() == planning_placeholder_udf::UNNEST + { + match args.len() { + 1 => { + if captured.replace(args[0].clone()).is_some() { + return plan_err!( + "Multiple unnests in expression, which is not allowed" + ); + } + return Ok(Transformed::yes(Expr::Column(Column::new_unqualified( + UNNESTED_COL, + )))); + } + n => { + panic!("Unnest has wrong number of arguments (expected 1, found {n})"); + } + } + } + Ok(Transformed::no(e)) + })?; + + Ok((expr.data, captured)) + } +} + +impl TreeNodeRewriter for UnnestRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Projection(projection) = &node else { + if node.expressions().iter().any(|e| { + let e = Self::split_unnest(e.clone()); + e.is_err() || e.unwrap().1.is_some() + }) { + return plan_err!("unnest is only supported in SELECT statements"); + } + return Ok(Transformed::no(node)); + }; + + let mut unnest = None; + let exprs = projection + .expr + .clone() + .into_iter() + .enumerate() + .map(|(i, expr)| { + let (expr, opt) = Self::split_unnest(expr)?; + let is_unnest 
= if let Some(e) = opt { + if let Some(prev) = unnest.replace((e, i)) + && &prev != unnest.as_ref().unwrap() + { + return plan_err!( + "Projection contains multiple unnests, which is not currently supported" + ); + } + true + } else { + false + }; + + Ok((expr, is_unnest)) + }) + .collect::>>()?; + + if let Some((unnest_inner, unnest_idx)) = unnest { + let produce_list = Arc::new(LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .cloned() + .map(|(e, is_unnest)| { + if is_unnest { + unnest_inner.clone().alias(UNNESTED_COL) + } else { + e + } + }) + .collect(), + projection.input.clone(), + )?)); + + let unnest_fields = extract_qualified_fields(produce_list.schema()) + .iter() + .enumerate() + .map(|(i, f)| { + if i == unnest_idx { + let DataType::List(inner) = f.data_type() else { + return plan_err!( + "Argument '{}' to unnest is not a List", + f.qualified_name() + ); + }; + Ok(QualifiedField::new_unqualified( + UNNESTED_COL, + inner.data_type().clone(), + inner.is_nullable(), + )) + } else { + Ok((*f).clone()) + } + }) + .collect::>>()?; + + let unnest_node = LogicalPlan::Unnest(Unnest { + exec_columns: vec![ + QualifiedField::from(produce_list.schema().qualified_field(unnest_idx)) + .qualified_column(), + ], + input: produce_list, + list_type_columns: vec![( + unnest_idx, + ColumnUnnestList { + output_column: Column::new_unqualified(UNNESTED_COL), + depth: 1, + }, + )], + struct_type_columns: vec![], + dependency_indices: vec![], + schema: Arc::new(build_df_schema(&unnest_fields)?), + options: Default::default(), + }); + + let output_node = LogicalPlan::Projection(Projection::try_new( + exprs + .iter() + .enumerate() + .map(|(i, (expr, has_unnest))| { + if *has_unnest { + expr.clone() + } else { + Expr::Column( + QualifiedField::from(unnest_node.schema().qualified_field(i)) + .qualified_column(), + ) + } + }) + .collect(), + Arc::new(unnest_node), + )?); + + Ok(Transformed::yes(output_node)) + } else { + 
Ok(Transformed::no(LogicalPlan::Projection(projection.clone()))) + } + } +} diff --git a/src/sql/analysis/window_function_rewriter.rs b/src/sql/analysis/window_function_rewriter.rs new file mode 100644 index 00000000..c1e3396d --- /dev/null +++ b/src/sql/analysis/window_function_rewriter.rs @@ -0,0 +1,204 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::common::tree_node::Transformed; +use datafusion::common::{Result as DFResult, plan_err, tree_node::TreeNodeRewriter}; +use datafusion::logical_expr::{ + self, Expr, Extension, LogicalPlan, Projection, Sort, Window, expr::WindowFunction, + expr::WindowFunctionParams, +}; +use datafusion_common::DataFusionError; +use std::sync::Arc; +use tracing::debug; + +use crate::sql::analysis::streaming_window_analzer::{StreamingWindowAnalzer, extract_column}; +use crate::sql::logical_node::key_calculation::{KeyExtractionNode, KeyExtractionStrategy}; +use crate::sql::logical_node::windows_function::StreamingWindowFunctionNode; +use crate::sql::types::{WindowType, build_df_schema, extract_qualified_fields}; + +/// WindowFunctionRewriter transforms standard SQL Window functions into streaming-compatible +/// stateful operators, ensuring proper data partitioning and sorting for distributed execution. +pub(crate) struct WindowFunctionRewriter; + +impl WindowFunctionRewriter { + /// Recursively unwraps Aliases to find the underlying WindowFunction. 
+ #[allow(clippy::only_used_in_recursion)] + fn resolve_window_function(&self, expr: &Expr) -> DFResult<(WindowFunction, String)> { + match expr { + Expr::Alias(alias) => { + let (func, _) = self.resolve_window_function(&alias.expr)?; + Ok((func, alias.name.clone())) + } + Expr::WindowFunction(wf) => Ok((wf.as_ref().clone(), expr.name_for_alias()?)), + _ => plan_err!("Expected WindowFunction or Alias, found: {:?}", expr), + } + } + + /// Identifies which field in the PARTITION BY clause corresponds to the streaming window. + fn identify_window_partition( + &self, + params: &WindowFunctionParams, + input: &LogicalPlan, + input_window_fields: &std::collections::HashSet, + ) -> DFResult { + let matched: Vec<_> = params + .partition_by + .iter() + .enumerate() + .filter_map(|(i, e)| { + let col = extract_column(e)?; + let field = input + .schema() + .field_with_name(col.relation.as_ref(), &col.name) + .ok()?; + let df_field = (col.relation.clone(), Arc::new(field.clone())).into(); + + if input_window_fields.contains(&df_field) { + Some(i) + } else { + None + } + }) + .collect(); + + if matched.len() != 1 { + return plan_err!( + "Streaming window functions require exactly one window column in PARTITION BY. Found: {}", + matched.len() + ); + } + Ok(matched[0]) + } + + /// Wraps the input in a Projection and KeyExtractionNode to handle data distribution. + fn build_keyed_input( + &self, + input: Arc, + partition_keys: &[Expr], + ) -> DFResult { + let key_count = partition_keys.len(); + + // 1. Build projection: [_key_0, _key_1, ..., original_columns] + let mut exprs: Vec<_> = partition_keys + .iter() + .enumerate() + .map(|(i, e)| e.clone().alias(format!("_key_{i}"))) + .collect(); + + exprs.extend( + extract_qualified_fields(input.schema()) + .iter() + .map(|f| Expr::Column(f.qualified_column())), + ); + + // 2. 
Derive the keyed schema + let mut keyed_fields = + extract_qualified_fields(&Projection::try_new(exprs.clone(), input.clone())?.schema) + .iter() + .take(key_count) + .cloned() + .collect::>(); + keyed_fields.extend(extract_qualified_fields(input.schema())); + + let keyed_schema = Arc::new(build_df_schema(&keyed_fields)?); + + let projection = + LogicalPlan::Projection(Projection::try_new_with_schema(exprs, input, keyed_schema)?); + + // 3. Wrap in KeyExtractionNode for the physical planner + Ok(LogicalPlan::Extension(Extension { + node: Arc::new(KeyExtractionNode::new( + projection, + KeyExtractionStrategy::ColumnIndices((0..key_count).collect()), + )), + })) + } +} + +impl TreeNodeRewriter for WindowFunctionRewriter { + type Node = LogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> DFResult> { + let LogicalPlan::Window(window) = node else { + return Ok(Transformed::no(node)); + }; + + debug!("Rewriting window function for streaming: {:?}", window); + + // 1. Analyze input windowing context + let mut analyzer = StreamingWindowAnalzer::default(); + window.input.visit_with_subqueries(&mut analyzer)?; + + let input_window = analyzer.window.ok_or_else(|| { + DataFusionError::Plan( + "Window functions require a windowed input stream (e.g., TUMBLE/HOP)".into(), + ) + })?; + + if matches!(input_window, WindowType::Session { .. }) { + return plan_err!( + "Streaming window functions (OVER) are not supported on Session windows." + ); + } + + // 2. Validate window expression constraints + if window.window_expr.len() != 1 { + return plan_err!( + "Arroyo currently supports exactly one window expression per OVER clause." + ); + } + + let (mut wf, original_name) = self.resolve_window_function(&window.window_expr[0])?; + + // 3. 
Identify and extract the window column from PARTITION BY + let window_part_idx = + self.identify_window_partition(&wf.params, &window.input, &analyzer.fields)?; + let mut partition_keys = wf.params.partition_by.clone(); + partition_keys.remove(window_part_idx); + + // Update function params to exclude the window column from internal partitioning + // as the streaming engine handles window boundaries natively. + wf.params.partition_by = partition_keys.clone(); + let key_count = partition_keys.len(); + + // 4. Build the data-shuffling pipeline (Projection -> KeyCalc -> Sort) + let keyed_plan = self.build_keyed_input(window.input.clone(), &partition_keys)?; + + let mut sort_exprs: Vec<_> = partition_keys + .iter() + .map(|e| logical_expr::expr::Sort { + expr: e.clone(), + asc: true, + nulls_first: false, + }) + .collect(); + sort_exprs.extend(wf.params.order_by.clone()); + + let sorted_plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: Arc::new(keyed_plan), + fetch: None, + }); + + // 5. Final Assembly + let final_wf_expr = Expr::WindowFunction(Box::new(wf)).alias_if_changed(original_name)?; + let rewritten_window = + LogicalPlan::Window(Window::try_new(vec![final_wf_expr], Arc::new(sorted_plan))?); + + Ok(Transformed::yes(LogicalPlan::Extension(Extension { + node: Arc::new(StreamingWindowFunctionNode::new( + rewritten_window, + (0..key_count).collect(), + )), + }))) + } +} diff --git a/src/sql/api/checkpoints.rs b/src/sql/api/checkpoints.rs new file mode 100644 index 00000000..d9bdc139 --- /dev/null +++ b/src/sql/api/checkpoints.rs @@ -0,0 +1,108 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::common::to_micros; +use serde::{Deserialize, Serialize}; +use std::time::SystemTime; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Checkpoint { + pub epoch: u32, + pub backend: String, + pub start_time: u64, + pub finish_time: Option, + pub events: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct CheckpointEventSpan { + pub start_time: u64, + pub finish_time: u64, + pub event: String, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskCheckpointGroup { + pub index: u32, + pub bytes: u64, + pub event_spans: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorCheckpointGroup { + pub operator_id: String, + pub bytes: u64, + pub started_metadata_write: Option, + pub finish_time: Option, + pub subtasks: Vec, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub enum JobCheckpointEventType { + Checkpointing, + CheckpointingOperators, + WritingMetadata, + Compacting, + Committing, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JobCheckpointSpan { + pub event: JobCheckpointEventType, + pub start_time: u64, + pub finish_time: Option, +} + +impl JobCheckpointSpan { + pub fn now(event: JobCheckpointEventType) -> Self { + Self { + event, + start_time: to_micros(SystemTime::now()), + finish_time: None, + } + } + + pub fn finish(&mut self) { 
+ if self.finish_time.is_none() { + self.finish_time = Some(to_micros(SystemTime::now())); + } + } +} + +impl From for CheckpointEventSpan { + fn from(value: JobCheckpointSpan) -> Self { + let description = match value.event { + JobCheckpointEventType::Checkpointing => "The entire checkpointing process", + JobCheckpointEventType::CheckpointingOperators => { + "The time spent checkpointing operator states" + } + JobCheckpointEventType::WritingMetadata => "Writing the final checkpoint metadata", + JobCheckpointEventType::Compacting => "Compacting old checkpoints", + JobCheckpointEventType::Committing => { + "Running two-phase commit for transactional connectors" + } + } + .to_string(); + + Self { + start_time: value.start_time, + finish_time: value.finish_time.unwrap_or_default(), + event: format!("{:?}", value.event), + description, + } + } +} diff --git a/src/sql/api/connections.rs b/src/sql/api/connections.rs new file mode 100644 index 00000000..3c5caf76 --- /dev/null +++ b/src/sql/api/connections.rs @@ -0,0 +1,620 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::sql::common::formats::{BadData, Format, Framing}; +use crate::sql::common::{FsExtensionType, FsSchema}; +use datafusion::arrow::datatypes::{DataType, Field, Fields, TimeUnit}; +use serde::ser::SerializeMap; +use serde::{Deserialize, Serialize, Serializer}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Connector { + pub id: String, + pub name: String, + pub icon: String, + pub description: String, + pub table_config: String, + pub enabled: bool, + pub source: bool, + pub sink: bool, + pub custom_schemas: bool, + pub testing: bool, + pub hidden: bool, + pub connection_config: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfile { + pub id: String, + pub name: String, + pub connector: String, + pub config: serde_json::Value, + pub description: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionProfilePost { + pub name: String, + pub connector: String, + pub config: serde_json::Value, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[serde(rename_all = "snake_case")] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl Display for ConnectionType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + ConnectionType::Source => write!(f, "SOURCE"), + ConnectionType::Sink => write!(f, "SINK"), + ConnectionType::Lookup => write!(f, "LOOKUP"), + } + } +} + +impl TryFrom for ConnectionType { + type Error = String; + + fn try_from(value: String) -> Result { + match value.to_lowercase().as_str() { + "source" => Ok(ConnectionType::Source), + "sink" => Ok(ConnectionType::Sink), + "lookup" => Ok(ConnectionType::Lookup), + _ => Err(format!("Invalid connection type: {value}")), + } 
+ } +} + +// ─────────────────── Field Types ─────────────────── + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum FieldType { + Int32, + Int64, + Uint32, + Uint64, + #[serde(alias = "f32")] + Float32, + #[serde(alias = "f64")] + Float64, + Decimal128(DecimalField), + Bool, + #[serde(alias = "utf8")] + String, + #[serde(alias = "binary")] + Bytes, + Timestamp(TimestampField), + Json, + Struct(StructField), + List(ListField), +} + +impl FieldType { + pub fn sql_type(&self) -> String { + match self { + FieldType::Int32 => "INTEGER".into(), + FieldType::Int64 => "BIGINT".into(), + FieldType::Uint32 => "INTEGER UNSIGNED".into(), + FieldType::Uint64 => "BIGINT UNSIGNED".into(), + FieldType::Float32 => "FLOAT".into(), + FieldType::Float64 => "DOUBLE".into(), + FieldType::Decimal128(f) => format!("DECIMAL({}, {})", f.precision, f.scale), + FieldType::Bool => "BOOLEAN".into(), + FieldType::String => "TEXT".into(), + FieldType::Bytes => "BINARY".into(), + FieldType::Timestamp(t) => format!("TIMESTAMP({})", t.unit.precision()), + FieldType::Json => "JSON".into(), + FieldType::List(item) => format!("{}[]", item.items.field_type.sql_type()), + FieldType::Struct(StructField { fields, .. 
}) => { + format!( + "STRUCT <{}>", + fields + .iter() + .map(|f| format!("{} {}", f.name, f.field_type.sql_type())) + .collect::>() + .join(", ") + ) + } + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum TimestampUnit { + #[serde(alias = "s")] + Second, + #[default] + #[serde(alias = "ms")] + Millisecond, + #[serde(alias = "µs", alias = "us")] + Microsecond, + #[serde(alias = "ns")] + Nanosecond, +} + +impl TimestampUnit { + pub fn precision(&self) -> u8 { + match self { + TimestampUnit::Second => 0, + TimestampUnit::Millisecond => 3, + TimestampUnit::Microsecond => 6, + TimestampUnit::Nanosecond => 9, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct TimestampField { + #[serde(default)] + pub unit: TimestampUnit, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct DecimalField { + pub precision: u8, + pub scale: i8, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct StructField { + pub fields: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListField { + pub items: Box, +} + +fn default_item_name() -> String { + "item".to_string() +} + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct ListFieldItem { + #[serde(default = "default_item_name")] + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub sql_name: Option, +} + +impl From for Field { + fn from(value: ListFieldItem) -> Self { + SourceField { + name: value.name, + field_type: value.field_type, + required: value.required, + sql_name: None, + metadata_key: None, + } + .into() + } +} + +impl Serialize for ListFieldItem 
{ + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl TryFrom for ListFieldItem { + type Error = String; + + fn try_from(value: Field) -> Result { + let source_field: SourceField = value.try_into()?; + Ok(Self { + name: source_field.name, + field_type: source_field.field_type, + required: source_field.required, + sql_name: None, + }) + } +} + +fn serialize_field_type_flat(ft: &FieldType, map: &mut M) -> Result<(), M::Error> { + let type_tag = match ft { + FieldType::Int32 => "int32", + FieldType::Int64 => "int64", + FieldType::Uint32 => "uint32", + FieldType::Uint64 => "uint64", + FieldType::Float32 => "float32", + FieldType::Float64 => "float64", + FieldType::Decimal128(_) => "decimal128", + FieldType::Bool => "bool", + FieldType::String => "string", + FieldType::Bytes => "bytes", + FieldType::Timestamp(_) => "timestamp", + FieldType::Json => "json", + FieldType::Struct(_) => "struct", + FieldType::List(_) => "list", + }; + map.serialize_entry("type", type_tag)?; + + match ft { + FieldType::Decimal128(d) => { + map.serialize_entry("precision", &d.precision)?; + map.serialize_entry("scale", &d.scale)?; + } + FieldType::Timestamp(t) => { + map.serialize_entry("unit", &t.unit)?; + } + FieldType::Struct(s) => { + map.serialize_entry("fields", &s.fields)?; + } + FieldType::List(l) => { + map.serialize_entry("items", &l.items)?; + } + _ => {} + } + Ok(()) +} + +// ─────────────────── Source Field ─────────────────── + +#[derive(Deserialize, Clone, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub struct SourceField { + pub name: String, + #[serde(flatten)] + pub field_type: FieldType, + #[serde(default)] + pub required: bool, + #[serde(default)] + pub 
sql_name: Option, + #[serde(default)] + pub metadata_key: Option, +} + +impl Serialize for SourceField { + fn serialize(&self, s: S) -> Result + where + S: Serializer, + { + let mut f = Serializer::serialize_map(s, None)?; + f.serialize_entry("name", &self.name)?; + serialize_field_type_flat(&self.field_type, &mut f)?; + f.serialize_entry("required", &self.required)?; + if let Some(metadata_key) = &self.metadata_key { + f.serialize_entry("metadata_key", metadata_key)?; + } + f.serialize_entry("sql_name", &self.field_type.sql_type())?; + f.end() + } +} + +impl From for Field { + fn from(f: SourceField) -> Self { + let (t, ext) = match f.field_type { + FieldType::Int32 => (DataType::Int32, None), + FieldType::Int64 => (DataType::Int64, None), + FieldType::Uint32 => (DataType::UInt32, None), + FieldType::Uint64 => (DataType::UInt64, None), + FieldType::Float32 => (DataType::Float32, None), + FieldType::Float64 => (DataType::Float64, None), + FieldType::Bool => (DataType::Boolean, None), + FieldType::String => (DataType::Utf8, None), + FieldType::Bytes => (DataType::Binary, None), + FieldType::Decimal128(d) => (DataType::Decimal128(d.precision, d.scale), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) => (DataType::Timestamp(TimeUnit::Second, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) => (DataType::Timestamp(TimeUnit::Millisecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) => (DataType::Timestamp(TimeUnit::Microsecond, None), None), + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) => (DataType::Timestamp(TimeUnit::Nanosecond, None), None), + FieldType::Json => (DataType::Utf8, Some(FsExtensionType::JSON)), + FieldType::Struct(s) => ( + DataType::Struct(Fields::from( + s.fields + .into_iter() + .map(|t| t.into()) + .collect::>(), + )), + None, + ), + FieldType::List(t) => 
(DataType::List(Arc::new((*t.items).into())), None), + }; + + FsExtensionType::add_metadata(ext, Field::new(f.name, t, !f.required)) + } +} + +impl TryFrom for SourceField { + type Error = String; + + fn try_from(f: Field) -> Result { + let field_type = match (f.data_type(), FsExtensionType::from_map(f.metadata())) { + (DataType::Boolean, None) => FieldType::Bool, + (DataType::Int32, None) => FieldType::Int32, + (DataType::Int64, None) => FieldType::Int64, + (DataType::UInt32, None) => FieldType::Uint32, + (DataType::UInt64, None) => FieldType::Uint64, + (DataType::Float32, None) => FieldType::Float32, + (DataType::Float64, None) => FieldType::Float64, + (DataType::Decimal128(p, s), None) => FieldType::Decimal128(DecimalField { + precision: *p, + scale: *s, + }), + (DataType::Binary | DataType::LargeBinary | DataType::BinaryView, None) => { + FieldType::Bytes + } + (DataType::Timestamp(TimeUnit::Second, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Second, + }) + } + (DataType::Timestamp(TimeUnit::Millisecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Millisecond, + }) + } + (DataType::Timestamp(TimeUnit::Microsecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Microsecond, + }) + } + (DataType::Timestamp(TimeUnit::Nanosecond, _), None) => { + FieldType::Timestamp(TimestampField { + unit: TimestampUnit::Nanosecond, + }) + } + (DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, None) => FieldType::String, + ( + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View, + Some(FsExtensionType::JSON), + ) => FieldType::Json, + (DataType::Struct(fields), None) => { + let fields: Result<_, String> = fields + .into_iter() + .map(|f| (**f).clone().try_into()) + .collect(); + FieldType::Struct(StructField { fields: fields? 
}) + } + (DataType::List(item), None) => FieldType::List(ListField { + items: Box::new((**item).clone().try_into()?), + }), + dt => return Err(format!("Unsupported data type {dt:?}")), + }; + + Ok(SourceField { + name: f.name().clone(), + field_type, + required: !f.is_nullable(), + sql_name: None, + metadata_key: None, + }) + } +} + +// ─────────────────── Schema Definitions ─────────────────── + +#[allow(clippy::enum_variant_names)] +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum SchemaDefinition { + JsonSchema { + schema: String, + }, + ProtobufSchema { + schema: String, + #[serde(default)] + dependencies: HashMap, + }, + AvroSchema { + schema: String, + }, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionSchema { + pub format: Option, + #[serde(default)] + pub bad_data: Option, + #[serde(default)] + pub framing: Option, + #[serde(default)] + pub fields: Vec, + #[serde(default)] + pub definition: Option, + #[serde(default)] + pub inferred: Option, + #[serde(default)] + pub primary_keys: HashSet, +} + +impl ConnectionSchema { + pub fn try_new( + format: Option, + bad_data: Option, + framing: Option, + fields: Vec, + definition: Option, + inferred: Option, + primary_keys: HashSet, + ) -> anyhow::Result { + let s = ConnectionSchema { + format, + bad_data, + framing, + fields, + definition, + inferred, + primary_keys, + }; + s.validate() + } + + pub fn validate(self) -> anyhow::Result { + let non_metadata_fields: Vec<_> = self + .fields + .iter() + .filter(|f| f.metadata_key.is_none()) + .collect(); + + if let Some(Format::RawString(_)) = &self.format + && (non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::String + || non_metadata_fields.first().unwrap().name != "value") + { + anyhow::bail!( + "raw_string format requires a schema with a single field called `value` of type 
TEXT" + ); + } + + if let Some(Format::Json(json_format)) = &self.format + && json_format.unstructured + && (non_metadata_fields.len() != 1 + || non_metadata_fields.first().unwrap().field_type != FieldType::Json + || non_metadata_fields.first().unwrap().name != "value") + { + anyhow::bail!( + "json format with unstructured flag enabled requires a schema with a single field called `value` of type JSON" + ); + } + + Ok(self) + } + + pub fn fs_schema(&self) -> Arc { + let fields: Vec = self.fields.iter().map(|f| f.clone().into()).collect(); + Arc::new(FsSchema::from_fields(fields)) + } +} + +impl From for FsSchema { + fn from(val: ConnectionSchema) -> Self { + let fields: Vec = val.fields.into_iter().map(|f| f.into()).collect(); + FsSchema::from_fields(fields) + } +} + +// ─────────────────── Connection Table ─────────────────── + +#[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTable { + #[serde(skip_serializing)] + pub id: i64, + #[serde(rename = "id")] + pub pub_id: String, + pub name: String, + pub created_at: u64, + pub connector: String, + pub connection_profile: Option, + pub table_type: ConnectionType, + pub config: serde_json::Value, + pub schema: ConnectionSchema, + pub consumers: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionTablePost { + pub name: String, + pub connector: String, + pub connection_profile_id: Option, + pub config: serde_json::Value, + pub schema: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionAutocompleteResp { + pub values: BTreeMap>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct TestSourceMessage { + pub error: bool, + pub done: bool, + pub message: String, +} + +impl TestSourceMessage { + pub fn info(message: impl Into) -> Self { + Self { + error: false, + done: false, + message: 
message.into(), + } + } + pub fn error(message: impl Into) -> Self { + Self { + error: true, + done: false, + message: message.into(), + } + } + pub fn done(message: impl Into) -> Self { + Self { + error: false, + done: true, + message: message.into(), + } + } + pub fn fail(message: impl Into) -> Self { + Self { + error: true, + done: true, + message: message.into(), + } + } +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchema { + pub schema: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ConfluentSchemaQueryParams { + pub endpoint: String, + pub topic: String, +} diff --git a/src/sql/api/metrics.rs b/src/sql/api/metrics.rs new file mode 100644 index 00000000..671b52f6 --- /dev/null +++ b/src/sql/api/metrics.rs @@ -0,0 +1,53 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum MetricName { + BytesRecv, + BytesSent, + MessagesRecv, + MessagesSent, + Backpressure, + TxQueueSize, + TxQueueRem, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Metric { + pub time: u64, + pub value: f64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct SubtaskMetrics { + pub index: u32, + pub metrics: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct MetricGroup { + pub name: MetricName, + pub subtasks: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OperatorMetricGroup { + pub node_id: u32, + pub metric_groups: Vec, +} diff --git a/src/sql/api/mod.rs b/src/sql/api/mod.rs new file mode 100644 index 00000000..cdc119b7 --- /dev/null +++ b/src/sql/api/mod.rs @@ -0,0 +1,48 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! REST/RPC API types for the FunctionStream system. +//! +//! Adapted from Arroyo's `arroyo-rpc/src/api_types` and utility modules. 
+ +pub mod checkpoints; +pub mod connections; +pub mod metrics; +pub mod pipelines; +pub mod public_ids; +pub mod schema_resolver; +pub mod udfs; +pub mod var_str; + +use serde::{Deserialize, Serialize}; + +pub use connections::ConnectionProfile; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct PaginatedCollection { + pub data: Vec, + pub has_more: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "camelCase")] +pub struct NonPaginatedCollection { + pub data: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PaginationQueryParams { + pub starting_after: Option, + pub limit: Option, +} diff --git a/src/sql/api/pipelines.rs b/src/sql/api/pipelines.rs new file mode 100644 index 00000000..d6cc5253 --- /dev/null +++ b/src/sql/api/pipelines.rs @@ -0,0 +1,168 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::udfs::Udf; +use crate::sql::common::control::ErrorDomain; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateQueryPost { + pub query: String, + pub udfs: Option>, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct QueryValidationResult { + pub graph: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePost { + pub name: String, + pub query: String, + pub udfs: Option>, + pub parallelism: u64, + pub checkpoint_interval_micros: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PreviewPost { + pub query: String, + pub udfs: Option>, + #[serde(default)] + pub enable_sinks: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelinePatch { + pub parallelism: Option, + pub checkpoint_interval_micros: Option, + pub stop: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineRestart { + pub force: Option, + pub ignore_state: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Pipeline { + pub id: String, + pub name: String, + pub query: String, + pub udfs: Vec, + pub checkpoint_interval_micros: u64, + pub stop: StopType, + pub created_at: u64, + pub action: Option, + pub action_text: String, + pub action_in_progress: bool, + pub graph: PipelineGraph, + pub preview: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineGraph { + pub nodes: Vec, + pub edges: Vec, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineNode { + pub node_id: u32, + pub operator: String, + pub 
description: String, + pub parallelism: u32, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct PipelineEdge { + pub src_id: u32, + pub dest_id: u32, + pub key_type: String, + pub value_type: String, + pub edge_type: String, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum StopType { + None, + Checkpoint, + Graceful, + Immediate, + Force, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct FailureReason { + pub error: String, + pub domain: ErrorDomain, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Job { + pub id: String, + pub running_desired: bool, + pub state: String, + pub run_id: u64, + pub start_time: Option, + pub finish_time: Option, + pub tasks: Option, + pub failure_reason: Option, + pub created_at: u64, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub enum JobLogLevel { + Info, + Warn, + Error, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct JobLogMessage { + pub id: String, + pub created_at: u64, + pub operator_id: Option, + pub task_index: Option, + pub level: JobLogLevel, + pub message: String, + pub details: String, + pub error_domain: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct OutputData { + pub operator_id: String, + pub subtask_idx: u32, + pub timestamps: Vec, + pub start_id: u64, + pub batch: String, +} diff --git a/src/sql/api/public_ids.rs b/src/sql/api/public_ids.rs new file mode 100644 index 00000000..33aa6427 --- /dev/null +++ b/src/sql/api/public_ids.rs @@ -0,0 +1,69 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};

/// Number of characters in the generated suffix (after the prefix and `_`).
const ID_LENGTH: usize = 10;

/// Base-62 alphabet used for the ID suffix.
const ALPHABET: &[u8; 62] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/// Monotonic per-process counter mixed into the seed so that two calls landing
/// in the same clock tick still produce distinct IDs.
static ID_COUNTER: AtomicU64 = AtomicU64::new(0);

/// The kinds of entities that receive public IDs; each maps to a short prefix.
pub enum IdTypes {
    ApiKey,
    ConnectionProfile,
    Schema,
    Pipeline,
    JobConfig,
    Checkpoint,
    JobStatus,
    ClusterInfo,
    JobLogMessage,
    ConnectionTable,
    ConnectionTablePipeline,
    Udf,
}

/// Generates a unique identifier with a type-specific prefix, e.g. `pl_a1B2c3D4e5`.
///
/// Uses the current time plus a process-wide atomic counter, fed through an
/// LCG-style mixer, instead of nanoid — avoids an extra dependency.
///
/// BUGFIX: the previous implementation seeded purely from the wall clock, so
/// two calls within the same clock tick (coarse clocks are common) returned
/// identical "unique" IDs. Mixing in `ID_COUNTER` makes in-process collisions
/// vanishingly unlikely.
pub fn generate_id(id_type: IdTypes) -> String {
    let prefix = match id_type {
        IdTypes::ApiKey => "ak",
        IdTypes::ConnectionProfile => "cp",
        IdTypes::Schema => "sch",
        IdTypes::Pipeline => "pl",
        IdTypes::JobConfig => "job",
        IdTypes::Checkpoint => "chk",
        IdTypes::JobStatus => "js",
        IdTypes::ClusterInfo => "ci",
        IdTypes::JobLogMessage => "jlm",
        IdTypes::ConnectionTable => "ct",
        IdTypes::ConnectionTablePipeline => "ctp",
        IdTypes::Udf => "udf",
    };

    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_nanos();

    // Distinct counter values land in the high 64 bits so they cannot be
    // cancelled out by the (low-bits-heavy) timestamp.
    let counter = ID_COUNTER.fetch_add(1, Ordering::Relaxed) as u128;
    let mut seed = nanos ^ (counter << 64);

    let mut id = String::with_capacity(ID_LENGTH);
    for _ in 0..ID_LENGTH {
        // Knuth MMIX LCG constants; XOR keeps the state from ever fixing.
        seed ^= seed
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        // Modulo bias is negligible here (2^128 mod 62) and acceptable for IDs.
        let idx = (seed % ALPHABET.len() as u128) as usize;
        id.push(ALPHABET[idx] as char);
    }

    format!("{prefix}_{id}")
}
b/src/sql/api/schema_resolver.rs @@ -0,0 +1,94 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use async_trait::async_trait; + +/// Trait for resolving schemas by ID (e.g., from a schema registry). +#[async_trait] +pub trait SchemaResolver: Send { + async fn resolve_schema(&self, id: u32) -> Result, String>; +} + +/// A resolver that always fails — used when no schema registry is configured. +pub struct FailingSchemaResolver; + +impl Default for FailingSchemaResolver { + fn default() -> Self { + Self + } +} + +#[async_trait] +impl SchemaResolver for FailingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + Err(format!( + "Schema with id {id} not available, and no schema registry configured" + )) + } +} + +/// A resolver that returns a fixed schema for a known ID. +pub struct FixedSchemaResolver { + id: u32, + schema: String, +} + +impl FixedSchemaResolver { + pub fn new(id: u32, schema: String) -> Self { + FixedSchemaResolver { id, schema } + } +} + +#[async_trait] +impl SchemaResolver for FixedSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + if id == self.id { + Ok(Some(self.schema.clone())) + } else { + Err(format!("Unexpected schema id {}, expected {}", id, self.id)) + } + } +} + +/// A caching wrapper around any `SchemaResolver`. 
+pub struct CachingSchemaResolver { + inner: R, + cache: tokio::sync::RwLock>, +} + +impl CachingSchemaResolver { + pub fn new(inner: R) -> Self { + Self { + inner, + cache: tokio::sync::RwLock::new(std::collections::HashMap::new()), + } + } +} + +#[async_trait] +impl SchemaResolver for CachingSchemaResolver { + async fn resolve_schema(&self, id: u32) -> Result, String> { + { + let cache = self.cache.read().await; + if let Some(schema) = cache.get(&id) { + return Ok(Some(schema.clone())); + } + } + + let result = self.inner.resolve_schema(id).await?; + if let Some(ref schema) = result { + let mut cache = self.cache.write().await; + cache.insert(id, schema.clone()); + } + Ok(result) + } +} diff --git a/src/sql/api/udfs.rs b/src/sql/api/udfs.rs new file mode 100644 index 00000000..781d5b07 --- /dev/null +++ b/src/sql/api/udfs.rs @@ -0,0 +1,68 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct Udf { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ValidateUdfPost { + pub definition: String, + #[serde(default)] + pub language: UdfLanguage, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfValidationResult { + pub udf_name: Option, + pub errors: Vec, +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, Default, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum UdfLanguage { + Python, + #[default] + Rust, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct UdfPost { + pub prefix: String, + #[serde(default)] + pub language: UdfLanguage, + pub definition: String, + pub description: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde(rename_all = "snake_case")] +pub struct GlobalUdf { + pub id: String, + pub prefix: String, + pub name: String, + pub language: UdfLanguage, + pub created_at: u64, + pub updated_at: u64, + pub definition: String, + pub description: Option, + pub dylib_url: Option, +} diff --git a/src/sql/api/var_str.rs b/src/sql/api/var_str.rs new file mode 100644 index 00000000..2638cd06 --- /dev/null +++ b/src/sql/api/var_str.rs @@ -0,0 +1,91 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::env; + +/// A string that may contain `{{ VAR }}` placeholders for environment variable substitution. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct VarStr { + raw_val: String, +} + +impl VarStr { + pub fn new(raw_val: String) -> Self { + VarStr { raw_val } + } + + pub fn raw(&self) -> &str { + &self.raw_val + } + + /// Substitute `{{ VAR_NAME }}` patterns with the corresponding environment variable values. + pub fn sub_env_vars(&self) -> anyhow::Result { + let mut result = self.raw_val.clone(); + let mut start = 0; + + while let Some(open) = result[start..].find("{{") { + let open_abs = start + open; + let Some(close) = result[open_abs..].find("}}") else { + break; + }; + let close_abs = open_abs + close; + + let var_name = result[open_abs + 2..close_abs].trim(); + if var_name.is_empty() { + start = close_abs + 2; + continue; + } + + match env::var(var_name) { + Ok(value) => { + let full_match = &result[open_abs..close_abs + 2]; + let full_match_owned = full_match.to_string(); + result = result.replacen(&full_match_owned, &value, 1); + start = open_abs + value.len(); + } + Err(_) => { + anyhow::bail!("Environment variable {} not found", var_name); + } + } + } + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_no_placeholders() { + let input = "This is a test string with no placeholders"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + input + ); + } + + #[test] + fn test_with_placeholders() { + unsafe { env::set_var("FS_TEST_VAR", "environment variable") }; + let input = "This is a {{ FS_TEST_VAR }}"; + let expected = "This is a environment variable"; + assert_eq!( + VarStr::new(input.to_string()).sub_env_vars().unwrap(), + expected + ); + unsafe { env::remove_var("FS_TEST_VAR") }; + } +} diff --git 
// ---- src/sql/common/arrow_ext.rs ----
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::time::SystemTime;

use datafusion::arrow::datatypes::{DataType, Field, TimeUnit};

/// Newtype that renders an Arrow `DataType` as a SQL type name.
pub struct DisplayAsSql<'a>(pub &'a DataType);

impl Display for DisplayAsSql<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self.0 {
            DataType::Boolean => write!(f, "BOOLEAN"),
            DataType::Int8 | DataType::Int16 | DataType::Int32 => write!(f, "INT"),
            DataType::Int64 => write!(f, "BIGINT"),
            DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => write!(f, "INT UNSIGNED"),
            DataType::UInt64 => write!(f, "BIGINT UNSIGNED"),
            DataType::Float16 | DataType::Float32 => write!(f, "FLOAT"),
            DataType::Float64 => write!(f, "DOUBLE"),
            DataType::Timestamp(_, _) => write!(f, "TIMESTAMP"),
            DataType::Date32 => write!(f, "DATE"),
            DataType::Date64 => write!(f, "DATETIME"),
            DataType::Time32(_) => write!(f, "TIME"),
            DataType::Time64(_) => write!(f, "TIME"),
            DataType::Duration(_) => write!(f, "INTERVAL"),
            DataType::Interval(_) => write!(f, "INTERVAL"),
            DataType::Binary | DataType::FixedSizeBinary(_) | DataType::LargeBinary => {
                write!(f, "BYTEA")
            }
            DataType::Utf8 | DataType::LargeUtf8 => write!(f, "TEXT"),
            // Recurse on the element type for list columns, e.g. `TEXT[]`.
            DataType::List(inner) => {
                write!(f, "{}[]", DisplayAsSql(inner.data_type()))
            }
            // Fallback: Arrow's own Display for types with no SQL mapping here.
            dt => write!(f, "{dt}"),
        }
    }
}

/// Arrow extension type markers for FunctionStream-specific semantics.
#[allow(clippy::upper_case_acronyms)]
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum FsExtensionType {
    JSON,
}

impl FsExtensionType {
    /// Reads the `ARROW:extension:name` entry from field metadata, if present.
    pub fn from_map(map: &HashMap<String, String>) -> Option<Self> {
        match map.get("ARROW:extension:name")?.as_str() {
            "functionstream.json" => Some(Self::JSON),
            _ => None,
        }
    }

    /// Attaches the extension marker to `field`'s metadata; no-op when `v` is None.
    pub fn add_metadata(v: Option<FsExtensionType>, field: Field) -> Field {
        if let Some(v) = v {
            let mut m = HashMap::new();
            match v {
                FsExtensionType::JSON => {
                    m.insert(
                        "ARROW:extension:name".to_string(),
                        "functionstream.json".to_string(),
                    );
                }
            }
            field.with_metadata(m)
        } else {
            field
        }
    }
}

/// Maps a Rust type to its Arrow `DataType`.
pub trait GetArrowType {
    fn arrow_type() -> DataType;
}

/// Maps a Rust struct type to a full Arrow schema.
pub trait GetArrowSchema {
    fn arrow_schema() -> datafusion::arrow::datatypes::Schema;
}

// Any type with a schema is also a struct-typed Arrow value.
impl<T> GetArrowType for T
where
    T: GetArrowSchema,
{
    fn arrow_type() -> DataType {
        DataType::Struct(Self::arrow_schema().fields.clone())
    }
}

impl GetArrowType for bool {
    fn arrow_type() -> DataType {
        DataType::Boolean
    }
}

impl GetArrowType for i8 {
    fn arrow_type() -> DataType {
        DataType::Int8
    }
}

impl GetArrowType for i16 {
    fn arrow_type() -> DataType {
        DataType::Int16
    }
}

impl GetArrowType for i32 {
    fn arrow_type() -> DataType {
        DataType::Int32
    }
}

impl GetArrowType for i64 {
    fn arrow_type() -> DataType {
        DataType::Int64
    }
}

impl GetArrowType for u8 {
    fn arrow_type() -> DataType {
        DataType::UInt8
    }
}

impl GetArrowType for u16 {
    fn arrow_type() -> DataType {
        DataType::UInt16
    }
}

impl GetArrowType for u32 {
    fn arrow_type() -> DataType {
        DataType::UInt32
    }
}

impl GetArrowType for u64 {
    fn arrow_type() -> DataType {
        DataType::UInt64
    }
}

impl GetArrowType for f32 {
    fn arrow_type() -> DataType {
        DataType::Float32
    }
}

impl GetArrowType for f64 {
    fn arrow_type() -> DataType {
        DataType::Float64
    }
}

impl GetArrowType for String {
    fn arrow_type() -> DataType {
        DataType::Utf8
    }
}

impl GetArrowType for Vec<u8> {
    fn arrow_type() -> DataType {
        DataType::Binary
    }
}

impl GetArrowType for SystemTime {
    fn arrow_type() -> DataType {
        DataType::Timestamp(TimeUnit::Nanosecond, None)
    }
}

// ---- src/sql/common/connector_options.rs ----
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::{BTreeMap, HashMap};
use std::num::{NonZero, NonZeroU64};
use std::str::FromStr;
use std::time::Duration;

use datafusion::common::{Result as DFResult, plan_datafusion_err};
use datafusion::error::DataFusionError;
use datafusion::sql::sqlparser::ast::{Expr, Ident, SqlOption, Value as SqlValue, ValueWithSpan};
use tracing::warn;

use super::constants::{interval_duration_unit, with_opt_bool_str};

/// Deserialize a typed options struct by pulling keys out of `ConnectorOptions`.
pub trait FromOpts: Sized {
    fn from_opts(opts: &mut ConnectorOptions) -> DFResult<Self>;
}

/// Key/value options from a SQL `WITH (...)` clause, plus `PARTITIONED BY` exprs.
///
/// Values are *pulled* (consumed) by the `pull_*` methods; any leftovers can be
/// drained at the end for the connector's runtime config.
// NOTE(review): generic parameters below were lost in extraction and
// reconstructed from usage — confirm against the original source.
pub struct ConnectorOptions {
    options: HashMap<String, Expr>,
    partitions: Vec<Expr>,
}

/// Renders a SQL expression as the plain string form stored in the catalog.
fn sql_expr_to_catalog_string(e: &Expr) -> String {
    match e {
        Expr::Value(ValueWithSpan { value, .. }) => match value {
            // Strip quoting for string-like literals.
            SqlValue::SingleQuotedString(s) | SqlValue::DoubleQuotedString(s) => s.clone(),
            SqlValue::NationalStringLiteral(s) => s.clone(),
            SqlValue::HexStringLiteral(s) => s.clone(),
            SqlValue::Number(n, _) => n.clone(),
            SqlValue::Boolean(b) => b.to_string(),
            SqlValue::Null => "NULL".to_string(),
            other => other.to_string(),
        },
        Expr::Identifier(ident) => ident.value.clone(),
        other => other.to_string(),
    }
}

impl ConnectorOptions {
    /// Build options from persisted catalog string maps (same semantics as SQL `WITH` literals).
    pub fn from_flat_string_map(map: HashMap<String, String>) -> DFResult<Self> {
        let mut options = HashMap::with_capacity(map.len());
        for (k, v) in map {
            // Every persisted value re-enters as a single-quoted string literal.
            options.insert(
                k,
                Expr::Value(SqlValue::SingleQuotedString(v).with_empty_span()),
            );
        }
        Ok(Self {
            options,
            partitions: Vec::new(),
        })
    }

    /// Build options from a parsed `WITH (...)` clause and optional `PARTITIONED BY`.
    ///
    /// # Errors
    /// Fails if any option is not a `key = value` pair.
    pub fn new(sql_opts: &[SqlOption], partition_by: &Option<Vec<Expr>>) -> DFResult<Self> {
        let mut options = HashMap::new();

        for option in sql_opts {
            let SqlOption::KeyValue { key, value } = option else {
                return Err(plan_datafusion_err!(
                    "invalid with option: '{}'; expected an `=` delimited key-value pair",
                    option
                ));
            };

            options.insert(key.value.clone(), value.clone());
        }

        Ok(Self {
            options,
            partitions: partition_by.clone().unwrap_or_default(),
        })
    }

    /// The `PARTITIONED BY` expressions, in declaration order.
    pub fn partitions(&self) -> &[Expr] {
        &self.partitions
    }

    /// Pulls a whole typed options struct (consumes the keys it reads).
    pub fn pull_struct<T: FromOpts>(&mut self) -> DFResult<T> {
        T::from_opts(self)
    }

    /// Removes and returns option `name` as a string, if present.
    ///
    /// # Errors
    /// Fails when the option exists but is not a single-quoted string.
    pub fn pull_opt_str(&mut self, name: &str) -> DFResult<Option<String>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => Ok(Some(s)),
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be a single-quoted string, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Like `pull_opt_str`, but the option is required.
    pub fn pull_str(&mut self, name: &str) -> DFResult<String> {
        self.pull_opt_str(name)?
            .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name))
    }

    /// Removes and returns option `name` as a bool; accepts SQL booleans and
    /// the string spellings in `with_opt_bool_str` (e.g. 'true'/'yes').
    pub fn pull_opt_bool(&mut self, name: &str) -> DFResult<Option<bool>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::Boolean(b),
                span: _,
            })) => Ok(Some(b)),
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => match s.as_str() {
                with_opt_bool_str::TRUE | with_opt_bool_str::YES => Ok(Some(true)),
                with_opt_bool_str::FALSE | with_opt_bool_str::NO => Ok(Some(false)),
                _ => Err(plan_datafusion_err!(
                    "expected with option '{}' to be a boolean, but it was `'{}'`",
                    name,
                    s
                )),
            },
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be a boolean, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Removes and returns option `name` as a u64; accepts bare numbers or
    /// quoted numeric strings.
    pub fn pull_opt_u64(&mut self, name: &str) -> DFResult<Option<u64>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::Number(s, _),
                span: _,
            }))
            | Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => s.parse::<u64>().map(Some).map_err(|_| {
                plan_datafusion_err!(
                    "expected with option '{}' to be an unsigned integer, but it was `{}`",
                    name,
                    s
                )
            }),
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be an unsigned integer, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Like `pull_opt_u64` but additionally rejects zero.
    pub fn pull_opt_nonzero_u64(&mut self, name: &str) -> DFResult<Option<NonZero<u64>>> {
        match self.pull_opt_u64(name)? {
            Some(0) => Err(plan_datafusion_err!(
                "expected with option '{name}' to be greater than 0, but it was 0"
            )),
            // unwrap is safe: the zero case was rejected above.
            Some(i) => Ok(Some(NonZeroU64::new(i).unwrap())),
            None => Ok(None),
        }
    }

    /// Removes and returns option `name` as a byte count (string-encoded u64).
    pub fn pull_opt_data_size_bytes(&mut self, name: &str) -> DFResult<Option<u64>> {
        self.pull_opt_str(name)?
            .map(|s| {
                s.parse::<u64>().map_err(|_| {
                    plan_datafusion_err!(
                        "expected with option '{}' to be a size in bytes (unsigned integer), but it was `{}`",
                        name,
                        s
                    )
                })
            })
            .transpose()
    }

    /// Removes and returns option `name` as an i64; accepts bare numbers or
    /// quoted numeric strings.
    pub fn pull_opt_i64(&mut self, name: &str) -> DFResult<Option<i64>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::Number(s, _),
                span: _,
            }))
            | Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => s.parse::<i64>().map(Some).map_err(|_| {
                plan_datafusion_err!(
                    "expected with option '{}' to be an integer, but it was `{}`",
                    name,
                    s
                )
            }),
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be an integer, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Like `pull_opt_i64`, but the option is required.
    pub fn pull_i64(&mut self, name: &str) -> DFResult<i64> {
        self.pull_opt_i64(name)?
            .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name))
    }

    /// Like `pull_opt_u64`, but the option is required.
    pub fn pull_u64(&mut self, name: &str) -> DFResult<u64> {
        self.pull_opt_u64(name)?
            .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name))
    }

    /// Removes and returns option `name` as an f64; accepts bare numbers or
    /// quoted numeric strings.
    pub fn pull_opt_f64(&mut self, name: &str) -> DFResult<Option<f64>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::Number(s, _),
                span: _,
            }))
            | Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => s.parse::<f64>().map(Some).map_err(|_| {
                plan_datafusion_err!(
                    "expected with option '{}' to be a double, but it was `{}`",
                    name,
                    s
                )
            }),
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be a double, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Like `pull_opt_f64`, but the option is required.
    pub fn pull_f64(&mut self, name: &str) -> DFResult<f64> {
        self.pull_opt_f64(name)?
            .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name))
    }

    /// Like `pull_opt_bool`, but the option is required.
    pub fn pull_bool(&mut self, name: &str) -> DFResult<bool> {
        self.pull_opt_bool(name)?
            .ok_or_else(|| plan_datafusion_err!("required option '{}' not set", name))
    }

    /// Removes and returns option `name` parsed as an interval duration
    /// (e.g. `INTERVAL '5 seconds'` or `'5 seconds'`).
    pub fn pull_opt_duration(&mut self, name: &str) -> DFResult<Option<Duration>> {
        match self.options.remove(name) {
            Some(e) => Ok(Some(duration_from_sql_expr(&e).map_err(|e| {
                plan_datafusion_err!("in with clause '{name}': {}", e)
            })?)),
            None => Ok(None),
        }
    }

    /// Removes and returns option `name` as a field reference.
    ///
    /// Accepts a bare identifier; string form is deprecated (warns).
    pub fn pull_opt_field(&mut self, name: &str) -> DFResult<Option<String>> {
        match self.options.remove(name) {
            Some(Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span: _,
            })) => {
                warn!(
                    "Referred to a field in `{name}` with a string—this is deprecated and will be unsupported after Arroyo 0.14"
                );
                Ok(Some(s))
            }
            Some(Expr::Identifier(Ident { value, .. })) => Ok(Some(value)),
            Some(e) => Err(plan_datafusion_err!(
                "expected with option '{}' to be a field, but it was `{:?}`",
                name,
                e
            )),
            None => Ok(None),
        }
    }

    /// Removes and returns option `name` as a list of expressions.
    ///
    /// A quoted string is split on commas; an array literal yields its
    /// elements; any other single expression becomes a one-element list.
    pub fn pull_opt_array(&mut self, name: &str) -> Option<Vec<Expr>> {
        Some(match self.options.remove(name)? {
            Expr::Value(ValueWithSpan {
                value: SqlValue::SingleQuotedString(s),
                span,
            }) => s
                .split(',')
                .map(|p| {
                    Expr::Value(ValueWithSpan {
                        value: SqlValue::SingleQuotedString(p.to_string()),
                        span,
                    })
                })
                .collect(),
            Expr::Array(a) => a.elem,
            e => vec![e],
        })
    }

    /// Removes and returns option `name` parsed via `FromStr` into `T`.
    pub fn pull_opt_parsed<T: FromStr>(&mut self, name: &str) -> DFResult<Option<T>> {
        Ok(match self.pull_opt_str(name)? {
            Some(s) => Some(
                s.parse()
                    .map_err(|_| plan_datafusion_err!("invalid value '{s}' for {name}"))?,
            ),
            None => None,
        })
    }

    /// Iterates over the remaining (un-pulled) option keys.
    pub fn keys(&self) -> impl Iterator<Item = &String> {
        self.options.keys()
    }

    /// Iterates over remaining option keys that start with `prefix`.
    pub fn keys_with_prefix<'a, 'b>(
        &'a self,
        prefix: &'b str,
    ) -> impl Iterator<Item = &'a String> + 'b
    where
        'a: 'b,
    {
        self.options.keys().filter(move |k| k.starts_with(prefix))
    }

    /// Inserts (or overwrites) a string option, returning the previous string
    /// value if one existed.
    pub fn insert_str(
        &mut self,
        name: impl Into<String>,
        value: impl Into<String>,
    ) -> DFResult<Option<String>> {
        let name = name.into();
        let value = value.into();
        // pull first so a previous non-string value surfaces as an error
        // rather than being silently replaced.
        let existing = self.pull_opt_str(&name)?;
        self.options.insert(
            name,
            Expr::Value(SqlValue::SingleQuotedString(value).with_empty_span()),
        );
        Ok(existing)
    }

    /// True when every option has been consumed.
    pub fn is_empty(&self) -> bool {
        self.options.is_empty()
    }

    /// True when option `key` has not yet been consumed.
    pub fn contains_key(&self, key: &str) -> bool {
        self.options.contains_key(key)
    }

    /// Drain all remaining options into string values (for connector runtime config).
    pub fn drain_remaining_string_values(&mut self) -> DFResult<HashMap<String, String>> {
        let taken = std::mem::take(&mut self.options);
        let mut out = HashMap::with_capacity(taken.len());
        for (k, v) in taken {
            // Uses the Expr Display impl, so quoted strings keep their quotes.
            out.insert(k, format!("{v}"));
        }
        Ok(out)
    }

    /// Snapshot of all current `WITH` key/value pairs for catalog persistence (`SHOW CREATE TABLE`).
    /// Call before any `pull_*` consumes options.
    pub fn snapshot_for_catalog(&self) -> BTreeMap<String, String> {
        self.options
            .iter()
            .map(|(k, v)| (k.clone(), sql_expr_to_catalog_string(v)))
            .collect()
    }
}

/// Converts an interval-like SQL expression to a `Duration`.
/// Accepts `INTERVAL '<n> <unit>'` or a bare quoted `'<n> <unit>'` string.
fn duration_from_sql_expr(expr: &Expr) -> Result<Duration, DataFusionError> {
    match expr {
        Expr::Interval(interval) => {
            let s = match interval.value.as_ref() {
                Expr::Value(ValueWithSpan {
                    value: SqlValue::SingleQuotedString(s),
                    ..
                }) => s.clone(),
                other => {
                    return Err(DataFusionError::Plan(format!(
                        "expected interval string literal, found {other}"
                    )));
                }
            };
            parse_interval_to_duration(&s)
        }
        Expr::Value(ValueWithSpan {
            value: SqlValue::SingleQuotedString(s),
            ..
        }) => parse_interval_to_duration(s),
        other => Err(DataFusionError::Plan(format!(
            "expected an interval expression, found {other}"
        ))),
    }
}

/// Parses `"<value> <unit>"` (e.g. `"5 seconds"`) into a `Duration`.
/// Supported units come from `constants::interval_duration_unit`.
fn parse_interval_to_duration(s: &str) -> Result<Duration, DataFusionError> {
    let parts: Vec<&str> = s.split_whitespace().collect();
    if parts.len() != 2 {
        return Err(DataFusionError::Plan(format!(
            "invalid interval string '{s}'; expected '<value> <unit>'"
        )));
    }
    let value: u64 = parts[0]
        .parse()
        .map_err(|_| DataFusionError::Plan(format!("invalid interval number: {}", parts[0])))?;
    let unit_lc = parts[1].to_lowercase();
    let unit = unit_lc.as_str();
    let duration = match unit {
        interval_duration_unit::SECOND
        | interval_duration_unit::SECONDS
        | interval_duration_unit::S => Duration::from_secs(value),
        interval_duration_unit::MINUTE
        | interval_duration_unit::MINUTES
        | interval_duration_unit::MIN => Duration::from_secs(value * 60),
        interval_duration_unit::HOUR
        | interval_duration_unit::HOURS
        | interval_duration_unit::H => Duration::from_secs(value * 3600),
        interval_duration_unit::DAY | interval_duration_unit::DAYS | interval_duration_unit::D => {
            Duration::from_secs(value * 86400)
        }
        unit => {
            return Err(DataFusionError::Plan(format!(
                "unsupported interval unit '{unit}'"
            )));
        }
    };
    Ok(duration)
}

// ---- src/sql/common/constants.rs ----
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod scalar_fn { + pub const GET_FIRST_JSON_OBJECT: &str = "get_first_json_object"; + pub const EXTRACT_JSON: &str = "extract_json"; + pub const EXTRACT_JSON_STRING: &str = "extract_json_string"; + pub const SERIALIZE_JSON_UNION: &str = "serialize_json_union"; + pub const MULTI_HASH: &str = "multi_hash"; +} + +pub mod window_fn { + pub const HOP: &str = "hop"; + pub const TUMBLE: &str = "tumble"; + pub const SESSION: &str = "session"; +} + +pub mod planning_placeholder_udf { + pub const UNNEST: &str = "unnest"; + pub const ROW_TIME: &str = "row_time"; + pub const LIST_ELEMENT_FIELD: &str = "field"; +} + +pub mod operator_feature { + pub const ASYNC_UDF: &str = "async-udf"; + pub const JOIN_WITH_EXPIRATION: &str = "join-with-expiration"; + pub const WINDOWED_JOIN: &str = "windowed-join"; + pub const SQL_WINDOW_FUNCTION: &str = "sql-window-function"; + pub const LOOKUP_JOIN: &str = "lookup-join"; + pub const SQL_TUMBLING_WINDOW_AGGREGATE: &str = "sql-tumbling-window-aggregate"; + pub const SQL_SLIDING_WINDOW_AGGREGATE: &str = "sql-sliding-window-aggregate"; + pub const SQL_SESSION_WINDOW_AGGREGATE: &str = "sql-session-window-aggregate"; + pub const SQL_UPDATING_AGGREGATE: &str = "sql-updating-aggregate"; + pub const KEY_BY_ROUTING: &str = "key-by-routing"; + pub const CONNECTOR_SOURCE: &str = "connector-source"; + pub const CONNECTOR_SINK: &str = "connector-sink"; +} + +pub mod extension_node { + pub const STREAM_WINDOW_AGGREGATE: &str = "StreamWindowAggregateNode"; + pub const STREAMING_WINDOW_FUNCTION: &str = "StreamingWindowFunctionNode"; + pub const EVENT_TIME_WATERMARK: &str = "EventTimeWatermarkNode"; + pub const CONTINUOUS_AGGREGATE: &str = "ContinuousAggregateNode"; + pub const SYSTEM_TIMESTAMP_INJECTOR: &str = "SystemTimestampInjectorNode"; + pub const STREAM_INGESTION: &str = "StreamIngestionNode"; + pub const STREAM_EGRESS: &str = 
"StreamEgressNode"; + pub const STREAM_PROJECTION: &str = "StreamProjectionNode"; + pub const REMOTE_TABLE_BOUNDARY: &str = "RemoteTableBoundaryNode"; + pub const REFERENCE_TABLE_SOURCE: &str = "ReferenceTableSource"; + pub const STREAM_REFERENCE_JOIN: &str = "StreamReferenceJoin"; + pub const KEY_EXTRACTION: &str = "KeyExtractionNode"; + pub const STREAMING_JOIN: &str = "StreamingJoinNode"; + pub const ASYNC_FUNCTION_EXECUTION: &str = "AsyncFunctionExecutionNode"; + pub const UNROLL_DEBEZIUM_PAYLOAD: &str = "UnrollDebeziumPayloadNode"; + pub const PACK_DEBEZIUM_ENVELOPE: &str = "PackDebeziumEnvelopeNode"; +} + +pub mod proto_operator_name { + pub const TUMBLING_WINDOW: &str = "TumblingWindow"; + pub const UPDATING_AGGREGATE: &str = "UpdatingAggregate"; + pub const WINDOW_FUNCTION: &str = "WindowFunction"; + pub const SLIDING_WINDOW_LABEL: &str = "sliding window"; + pub const INSTANT_WINDOW: &str = "InstantWindow"; + pub const INSTANT_WINDOW_LABEL: &str = "instant window"; +} + +pub mod runtime_operator_kind { + pub const STREAMING_JOIN: &str = "streaming_join"; + pub const WATERMARK_GENERATOR: &str = "watermark_generator"; + pub const STREAMING_WINDOW_EVALUATOR: &str = "streaming_window_evaluator"; +} + +pub mod factory_operator_name { + pub const CONNECTOR_SOURCE: &str = "ConnectorSource"; + pub const CONNECTOR_SINK: &str = "ConnectorSink"; + pub const KAFKA_SOURCE: &str = "KafkaSource"; + pub const KAFKA_SINK: &str = "KafkaSink"; +} + +pub mod cdc { + pub const BEFORE: &str = "before"; + pub const AFTER: &str = "after"; + pub const OP: &str = "op"; +} + +pub mod updating_state_field { + pub const IS_RETRACT: &str = "is_retract"; + pub const ID: &str = "id"; +} + +pub mod sql_field { + pub const ASYNC_RESULT: &str = "__async_result"; + pub const DEFAULT_KEY_LABEL: &str = "key"; + pub const DEFAULT_PROJECTION_LABEL: &str = "projection"; + pub const COMPUTED_WATERMARK: &str = "__watermark"; + pub const TIMESTAMP_FIELD: &str = "_timestamp"; + pub const 
UPDATING_META_FIELD: &str = "_updating_meta"; +} + +pub mod sql_planning_default { + pub const DEFAULT_PARALLELISM: usize = 4; + pub const PLANNING_TTL_SECS: u64 = 24 * 60 * 60; +} + +pub mod with_opt_bool_str { + pub const TRUE: &str = "true"; + pub const YES: &str = "yes"; + pub const FALSE: &str = "false"; + pub const NO: &str = "no"; +} + +pub mod interval_duration_unit { + pub const SECOND: &str = "second"; + pub const SECONDS: &str = "seconds"; + pub const S: &str = "s"; + pub const MINUTE: &str = "minute"; + pub const MINUTES: &str = "minutes"; + pub const MIN: &str = "min"; + pub const HOUR: &str = "hour"; + pub const HOURS: &str = "hours"; + pub const H: &str = "h"; + pub const DAY: &str = "day"; + pub const DAYS: &str = "days"; + pub const D: &str = "d"; +} + +pub mod connection_format_value { + pub const JSON: &str = "json"; + pub const DEBEZIUM_JSON: &str = "debezium_json"; + pub const AVRO: &str = "avro"; + pub const PARQUET: &str = "parquet"; + pub const PROTOBUF: &str = "protobuf"; + pub const RAW_STRING: &str = "raw_string"; + pub const RAW_BYTES: &str = "raw_bytes"; +} + +pub mod framing_method_value { + pub const NEWLINE: &str = "newline"; + pub const NEWLINE_DELIMITED: &str = "newline_delimited"; +} + +pub mod bad_data_value { + pub const FAIL: &str = "fail"; + pub const DROP: &str = "drop"; +} + +pub mod timestamp_format_value { + pub const RFC3339_SNAKE: &str = "rfc3339"; + pub const RFC3339_UPPER: &str = "RFC3339"; + pub const UNIX_MILLIS_SNAKE: &str = "unix_millis"; + pub const UNIX_MILLIS_PASCAL: &str = "UnixMillis"; +} + +pub mod decimal_encoding_value { + pub const NUMBER: &str = "number"; + pub const STRING: &str = "string"; + pub const BYTES: &str = "bytes"; +} + +pub mod json_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const GZIP: &str = "gzip"; +} + +pub mod parquet_compression_value { + pub const UNCOMPRESSED: &str = "uncompressed"; + pub const SNAPPY: &str = "snappy"; + pub const GZIP: &str = "gzip"; + 
pub const ZSTD: &str = "zstd"; + pub const LZ4: &str = "lz4"; + pub const LZ4_RAW: &str = "lz4_raw"; +} + +pub mod date_part_keyword { + pub const YEAR: &str = "year"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; + pub const MILLISECOND: &str = "millisecond"; + pub const MICROSECOND: &str = "microsecond"; + pub const NANOSECOND: &str = "nanosecond"; + pub const DOW: &str = "dow"; + pub const DOY: &str = "doy"; +} + +pub mod date_trunc_keyword { + pub const YEAR: &str = "year"; + pub const QUARTER: &str = "quarter"; + pub const MONTH: &str = "month"; + pub const WEEK: &str = "week"; + pub const DAY: &str = "day"; + pub const HOUR: &str = "hour"; + pub const MINUTE: &str = "minute"; + pub const SECOND: &str = "second"; +} + +pub mod mem_exec_join_side { + pub const LEFT: &str = "left"; + pub const RIGHT: &str = "right"; +} + +pub mod physical_plan_node_name { + pub const RW_LOCK_READER: &str = "rw_lock_reader"; + pub const UNBOUNDED_READER: &str = "unbounded_reader"; + pub const VEC_READER: &str = "vec_reader"; + pub const MEM_EXEC: &str = "mem_exec"; + pub const DEBEZIUM_UNROLLING_EXEC: &str = "debezium_unrolling_exec"; + pub const TO_DEBEZIUM_EXEC: &str = "to_debezium_exec"; +} + +pub mod window_function_udf { + pub const NAME: &str = "window"; +} + +pub mod window_interval_field { + pub const START: &str = "start"; + pub const END: &str = "end"; +} + +pub mod debezium_op_short { + pub const CREATE: &str = "c"; + pub const READ: &str = "r"; + pub const UPDATE: &str = "u"; + pub const DELETE: &str = "d"; +} + +pub mod connector_type { + pub const KAFKA: &str = "kafka"; + pub const KINESIS: &str = "kinesis"; + pub const FILESYSTEM: &str = "filesystem"; + pub const DELTA: &str = "delta"; + pub const ICEBERG: &str = "iceberg"; + pub const PULSAR: &str = "pulsar"; + pub const NATS: &str = "nats"; + pub const 
REDIS: &str = "redis"; + pub const MQTT: &str = "mqtt"; + pub const WEBSOCKET: &str = "websocket"; + pub const SSE: &str = "sse"; + pub const NEXMARK: &str = "nexmark"; + pub const BLACKHOLE: &str = "blackhole"; + pub const MEMORY: &str = "memory"; + pub const POSTGRES: &str = "postgres"; +} + +pub mod connection_table_role { + pub const SOURCE: &str = "source"; + pub const SINK: &str = "sink"; + pub const LOOKUP: &str = "lookup"; +} + +pub const SUPPORTED_CONNECTOR_ADAPTERS: &[&str] = &[connector_type::KAFKA]; + +pub mod kafka_with_value { + pub const SCAN_LATEST: &str = "latest"; + pub const SCAN_EARLIEST: &str = "earliest"; + pub const SCAN_GROUP_OFFSETS: &str = "group-offsets"; + pub const SCAN_GROUP: &str = "group"; + pub const ISOLATION_READ_COMMITTED: &str = "read_committed"; + pub const ISOLATION_READ_UNCOMMITTED: &str = "read_uncommitted"; + pub const SINK_COMMIT_EXACTLY_ONCE_HYPHEN: &str = "exactly-once"; + pub const SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE: &str = "exactly_once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_HYPHEN: &str = "at-least-once"; + pub const SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE: &str = "at_least_once"; +} diff --git a/src/sql/common/control.rs b/src/sql/common/control.rs new file mode 100644 index 00000000..eba88596 --- /dev/null +++ b/src/sql/common/control.rs @@ -0,0 +1,164 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::time::SystemTime; + +use crate::runtime::streaming::protocol::CheckpointBarrier; + +/// Control messages sent from the controller to worker tasks. +#[derive(Debug, Clone)] +pub enum ControlMessage { + Checkpoint(CheckpointBarrier), + Stop { + mode: StopMode, + }, + Commit { + epoch: u32, + commit_data: HashMap>>, + }, + LoadCompacted { + compacted: CompactionResult, + }, + NoOp, +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum StopMode { + Graceful, + Immediate, +} + +#[derive(Debug, Clone)] +pub struct CompactionResult { + pub operator_id: String, + pub compacted_tables: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableCheckpointMetadata { + pub table_type: TableType, + pub data: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TableType { + GlobalKeyValue, + ExpiringKeyedTimeTable, +} + +/// Responses sent from worker tasks back to the controller. +#[derive(Debug, Clone)] +pub enum ControlResp { + CheckpointEvent(CheckpointEvent), + CheckpointCompleted(CheckpointCompleted), + TaskStarted { + node_id: u32, + task_index: usize, + start_time: SystemTime, + }, + TaskFinished { + node_id: u32, + task_index: usize, + }, + TaskFailed { + node_id: u32, + task_index: usize, + error: TaskError, + }, + Error { + node_id: u32, + operator_id: String, + task_index: usize, + message: String, + details: String, + }, +} + +#[derive(Debug, Clone)] +pub struct CheckpointCompleted { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_metadata: SubtaskCheckpointMetadata, +} + +#[derive(Debug, Clone)] +pub struct SubtaskCheckpointMetadata { + pub subtask_index: u32, + pub start_time: u64, + pub finish_time: u64, + pub watermark: Option, + pub bytes: u64, + pub table_metadata: HashMap, + pub table_configs: HashMap, +} + +#[derive(Debug, Clone)] +pub struct TableSubtaskCheckpointMetadata { + pub subtask_index: u32, + pub table_type: TableType, + pub data: Vec, +} + 
+#[derive(Debug, Clone)] +pub struct TableConfig { + pub table_type: TableType, + pub config: Vec, + pub state_version: u32, +} + +#[derive(Debug, Clone)] +pub struct CheckpointEvent { + pub checkpoint_epoch: u32, + pub node_id: u32, + pub operator_id: String, + pub subtask_index: u32, + pub time: SystemTime, + pub event_type: TaskCheckpointEventType, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TaskCheckpointEventType { + StartedAlignment, + StartedCheckpointing, + FinishedOperatorSetup, + FinishedSync, + FinishedCommit, +} + +#[derive(Debug, Clone)] +pub struct TaskError { + pub job_id: String, + pub node_id: u32, + pub operator_id: String, + pub operator_subtask: u64, + pub error: String, + pub error_domain: ErrorDomain, + pub retry_hint: RetryHint, + pub details: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ErrorDomain { + User, + Internal, + External, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RetryHint { + NoRetry, + WithBackoff, +} diff --git a/src/sql/common/converter.rs b/src/sql/common/converter.rs new file mode 100644 index 00000000..a9023342 --- /dev/null +++ b/src/sql/common/converter.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use arrow::row::{OwnedRow, RowConverter, RowParser, Rows, SortField}; +use arrow_array::{Array, ArrayRef, BooleanArray}; +use arrow_schema::{ArrowError, DataType}; +use std::sync::Arc; + +// need to handle the empty case as a row converter without sort fields emits empty Rows. +#[derive(Debug)] +pub enum Converter { + RowConverter(RowConverter), + Empty(RowConverter, Arc), +} + +impl Converter { + pub fn new(sort_fields: Vec) -> Result { + if sort_fields.is_empty() { + let array = Arc::new(BooleanArray::from(vec![false])); + Ok(Self::Empty( + RowConverter::new(vec![SortField::new(DataType::Boolean)])?, + array, + )) + } else { + Ok(Self::RowConverter(RowConverter::new(sort_fields)?)) + } + } + + pub fn convert_columns(&self, columns: &[Arc]) -> Result { + match self { + Converter::RowConverter(row_converter) => { + Ok(row_converter.convert_columns(columns)?.row(0).owned()) + } + Converter::Empty(row_converter, array) => Ok(row_converter + .convert_columns(std::slice::from_ref(array))? + .row(0) + .owned()), + } + } + + pub fn convert_all_columns( + &self, + columns: &[Arc], + num_rows: usize, + ) -> Result { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_columns(columns)?), + Converter::Empty(row_converter, _array) => { + let array = Arc::new(BooleanArray::from(vec![false; num_rows])); + Ok(row_converter.convert_columns(&[array])?) + } + } + } + + pub fn convert_rows( + &self, + rows: Vec>, + ) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => Ok(row_converter.convert_rows(rows)?), + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn convert_raw_rows(&self, row_bytes: Vec<&[u8]>) -> Result, ArrowError> { + match self { + Converter::RowConverter(row_converter) => { + let parser = row_converter.parser(); + let mut row_list = vec![]; + for bytes in row_bytes { + let row = parser.parse(bytes); + row_list.push(row); + } + Ok(row_converter.convert_rows(row_list)?) 
+ } + Converter::Empty(_row_converter, _array) => Ok(vec![]), + } + } + + pub fn parser(&self) -> Option { + match self { + Converter::RowConverter(r) => Some(r.parser()), + Converter::Empty(_, _) => None, + } + } +} diff --git a/src/sql/common/date.rs b/src/sql/common/date.rs new file mode 100644 index 00000000..ec310326 --- /dev/null +++ b/src/sql/common/date.rs @@ -0,0 +1,86 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::Serialize; +use std::convert::TryFrom; + +use super::constants::{date_part_keyword, date_trunc_keyword}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Hash, Serialize)] +pub enum DatePart { + Year, + Month, + Week, + Day, + Hour, + Minute, + Second, + Millisecond, + Microsecond, + Nanosecond, + DayOfWeek, + DayOfYear, +} + +impl TryFrom<&str> for DatePart { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_part_keyword::YEAR => Ok(DatePart::Year), + date_part_keyword::MONTH => Ok(DatePart::Month), + date_part_keyword::WEEK => Ok(DatePart::Week), + date_part_keyword::DAY => Ok(DatePart::Day), + date_part_keyword::HOUR => Ok(DatePart::Hour), + date_part_keyword::MINUTE => Ok(DatePart::Minute), + date_part_keyword::SECOND => Ok(DatePart::Second), + date_part_keyword::MILLISECOND => Ok(DatePart::Millisecond), + date_part_keyword::MICROSECOND => Ok(DatePart::Microsecond), + date_part_keyword::NANOSECOND => Ok(DatePart::Nanosecond), + 
date_part_keyword::DOW => Ok(DatePart::DayOfWeek), + date_part_keyword::DOY => Ok(DatePart::DayOfYear), + _ => Err(format!("'{value}' is not a valid DatePart")), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Serialize)] +pub enum DateTruncPrecision { + Year, + Quarter, + Month, + Week, + Day, + Hour, + Minute, + Second, +} + +impl TryFrom<&str> for DateTruncPrecision { + type Error = String; + + fn try_from(value: &str) -> Result { + let v = value.to_lowercase(); + match v.as_str() { + date_trunc_keyword::YEAR => Ok(DateTruncPrecision::Year), + date_trunc_keyword::QUARTER => Ok(DateTruncPrecision::Quarter), + date_trunc_keyword::MONTH => Ok(DateTruncPrecision::Month), + date_trunc_keyword::WEEK => Ok(DateTruncPrecision::Week), + date_trunc_keyword::DAY => Ok(DateTruncPrecision::Day), + date_trunc_keyword::HOUR => Ok(DateTruncPrecision::Hour), + date_trunc_keyword::MINUTE => Ok(DateTruncPrecision::Minute), + date_trunc_keyword::SECOND => Ok(DateTruncPrecision::Second), + _ => Err(format!("'{value}' is not a valid DateTruncPrecision")), + } + } +} diff --git a/src/sql/common/debezium.rs b/src/sql/common/debezium.rs new file mode 100644 index 00000000..9dbc401f --- /dev/null +++ b/src/sql/common/debezium.rs @@ -0,0 +1,148 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bincode::{Decode, Encode}; +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::Debug; + +pub trait Key: + Debug + Clone + Encode + Decode<()> + std::hash::Hash + PartialEq + Eq + Send + 'static +{ +} +impl + std::hash::Hash + PartialEq + Eq + Send + 'static> Key + for T +{ +} + +pub trait Data: Debug + Clone + Encode + Decode<()> + Send + 'static {} +impl + Send + 'static> Data for T {} + +#[derive(Debug, Clone, PartialEq, Encode, Decode, Serialize, Deserialize)] +pub enum UpdatingData { + Retract(T), + Update { old: T, new: T }, + Append(T), +} + +impl UpdatingData { + pub fn lower(&self) -> T { + match self { + UpdatingData::Retract(_) => panic!("cannot lower retractions"), + UpdatingData::Update { new, .. } => new.clone(), + UpdatingData::Append(t) => t.clone(), + } + } + + pub fn unwrap_append(&self) -> &T { + match self { + UpdatingData::Append(t) => t, + _ => panic!("UpdatingData is not an append"), + } + } +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +#[serde(try_from = "DebeziumShadow")] +pub struct Debezium { + pub before: Option, + pub after: Option, + pub op: DebeziumOp, +} + +#[derive(Clone, Encode, Decode, Debug, Serialize, Deserialize, PartialEq)] +struct DebeziumShadow { + before: Option, + after: Option, + op: DebeziumOp, +} + +impl TryFrom> for Debezium { + type Error = &'static str; + + fn try_from(value: DebeziumShadow) -> Result { + match (value.op, &value.before, &value.after) { + (DebeziumOp::Create, _, None) => { + Err("`after` must be set for Debezium create messages") + } + (DebeziumOp::Update, None, _) => { + Err("`before` must be set for Debezium update messages") + } + (DebeziumOp::Update, _, None) => { + Err("`after` must be set for Debezium update messages") + } + (DebeziumOp::Delete, None, _) => { + Err("`before` must be set for Debezium delete messages") + } + _ => Ok(Debezium { + before: value.before, + after: value.after, + op: value.op, + }), + } + } +} + 
+#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq)] +pub enum DebeziumOp { + Create, + Update, + Delete, +} + +#[allow(clippy::to_string_trait_impl)] +impl ToString for DebeziumOp { + fn to_string(&self) -> String { + match self { + DebeziumOp::Create => "c", + DebeziumOp::Update => "u", + DebeziumOp::Delete => "d", + } + .to_string() + } +} + +impl<'de> Deserialize<'de> for DebeziumOp { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + match s.as_str() { + "c" | "r" => Ok(DebeziumOp::Create), + "u" => Ok(DebeziumOp::Update), + "d" => Ok(DebeziumOp::Delete), + _ => Err(serde::de::Error::custom(format!("Invalid DebeziumOp {s}"))), + } + } +} + +impl Serialize for DebeziumOp { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + DebeziumOp::Create => serializer.serialize_str("c"), + DebeziumOp::Update => serializer.serialize_str("u"), + DebeziumOp::Delete => serializer.serialize_str("d"), + } + } +} + +#[derive(Copy, Clone, Encode, Decode, Debug, PartialEq, Serialize, Deserialize)] +pub enum JoinType { + Inner, + Left, + Right, + Full, +} diff --git a/src/sql/common/errors.rs b/src/sql/common/errors.rs new file mode 100644 index 00000000..fa4a722e --- /dev/null +++ b/src/sql/common/errors.rs @@ -0,0 +1,92 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt; + +/// Result type for streaming operators and collectors. +pub type DataflowResult = std::result::Result; + +/// Unified error type for streaming dataflow operations. +#[derive(Debug)] +pub enum DataflowError { + Arrow(arrow_schema::ArrowError), + DataFusion(datafusion::error::DataFusionError), + Operator(String), + State(String), + Connector(String), + Internal(String), +} + +impl fmt::Display for DataflowError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DataflowError::Arrow(e) => write!(f, "Arrow error: {e}"), + DataflowError::DataFusion(e) => write!(f, "DataFusion error: {e}"), + DataflowError::Operator(msg) => write!(f, "Operator error: {msg}"), + DataflowError::State(msg) => write!(f, "State error: {msg}"), + DataflowError::Connector(msg) => write!(f, "Connector error: {msg}"), + DataflowError::Internal(msg) => write!(f, "Internal error: {msg}"), + } + } +} + +impl std::error::Error for DataflowError {} + +impl DataflowError { + pub fn with_operator(self, operator_id: impl Into) -> Self { + let id = operator_id.into(); + match self { + DataflowError::Operator(m) => DataflowError::Operator(format!("{id}: {m}")), + other => DataflowError::Operator(format!("{id}: {other}")), + } + } +} + +impl From for DataflowError { + fn from(e: arrow_schema::ArrowError) -> Self { + DataflowError::Arrow(e) + } +} + +impl From for DataflowError { + fn from(e: datafusion::error::DataFusionError) -> Self { + DataflowError::DataFusion(e) + } +} + +/// Macro for creating connector errors. +#[macro_export] +macro_rules! connector_err { + ($($arg:tt)*) => { + $crate::sql::common::errors::DataflowError::Connector(format!($($arg)*)) + }; +} + +/// State-related errors. 
+#[derive(Debug)] +pub enum StateError { + KeyNotFound(String), + SerializationError(String), + BackendError(String), +} + +impl fmt::Display for StateError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + StateError::KeyNotFound(key) => write!(f, "Key not found: {key}"), + StateError::SerializationError(msg) => write!(f, "Serialization error: {msg}"), + StateError::BackendError(msg) => write!(f, "State backend error: {msg}"), + } + } +} + +impl std::error::Error for StateError {} diff --git a/src/sql/common/format_from_opts.rs b/src/sql/common/format_from_opts.rs new file mode 100644 index 00000000..276235c1 --- /dev/null +++ b/src/sql/common/format_from_opts.rs @@ -0,0 +1,180 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Parse `WITH` clause format / framing / bad-data options (Arroyo-compatible keys). 
+ +use std::str::FromStr; + +use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err}; + +use super::connector_options::ConnectorOptions; +use super::constants::{bad_data_value, connection_format_value, framing_method_value}; +use super::formats::{ + AvroFormat, BadData, DecimalEncoding, Format, Framing, JsonCompression, JsonFormat, + NewlineDelimitedFraming, ParquetCompression, ParquetFormat, ProtobufFormat, RawBytesFormat, + RawStringFormat, TimestampFormat, +}; +use super::with_option_keys as opt; + +impl JsonFormat { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut j = JsonFormat::default(); + if let Some(v) = opts.pull_opt_bool(opt::JSON_CONFLUENT_SCHEMA_REGISTRY)? { + j.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_u64(opt::JSON_CONFLUENT_SCHEMA_VERSION)? { + j.schema_id = Some(v as u32); + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_INCLUDE_SCHEMA)? { + j.include_schema = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_DEBEZIUM)? { + j.debezium = v; + } + if let Some(v) = opts.pull_opt_bool(opt::JSON_UNSTRUCTURED)? { + j.unstructured = v; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_TIMESTAMP_FORMAT)? { + j.timestamp_format = TimestampFormat::try_from(s.as_str()) + .map_err(|_| plan_datafusion_err!("invalid json.timestamp_format '{}'", s))?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_DECIMAL_ENCODING)? { + j.decimal_encoding = DecimalEncoding::try_from(s.as_str()) + .map_err(|_| plan_datafusion_err!("invalid json.decimal_encoding '{s}'"))?; + } + if let Some(s) = opts.pull_opt_str(opt::JSON_COMPRESSION)? { + j.compression = JsonCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid json.compression: {e}"))?; + } + Ok(j) + } +} + +impl Format { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let Some(name) = opts.pull_opt_str(opt::FORMAT)? 
else { + return Ok(None); + }; + let n = name.to_lowercase(); + match n.as_str() { + connection_format_value::JSON => Ok(Some(Format::Json(JsonFormat::from_opts(opts)?))), + connection_format_value::DEBEZIUM_JSON => { + let mut j = JsonFormat::from_opts(opts)?; + j.debezium = true; + Ok(Some(Format::Json(j))) + } + connection_format_value::AVRO => Ok(Some(Format::Avro(AvroFormat::from_opts(opts)?))), + connection_format_value::PARQUET => { + Ok(Some(Format::Parquet(ParquetFormat::from_opts(opts)?))) + } + connection_format_value::PROTOBUF => { + Ok(Some(Format::Protobuf(ProtobufFormat::from_opts(opts)?))) + } + connection_format_value::RAW_STRING => Ok(Some(Format::RawString(RawStringFormat {}))), + connection_format_value::RAW_BYTES => Ok(Some(Format::RawBytes(RawBytesFormat {}))), + _ => plan_err!("unknown format '{name}'"), + } + } +} + +impl AvroFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut a = AvroFormat { + confluent_schema_registry: false, + raw_datums: false, + into_unstructured_json: false, + schema_id: None, + }; + if let Some(v) = opts.pull_opt_bool(opt::AVRO_CONFLUENT_SCHEMA_REGISTRY)? { + a.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_RAW_DATUMS)? { + a.raw_datums = v; + } + if let Some(v) = opts.pull_opt_bool(opt::AVRO_INTO_UNSTRUCTURED_JSON)? { + a.into_unstructured_json = v; + } + if let Some(v) = opts.pull_opt_u64(opt::AVRO_SCHEMA_ID)? { + a.schema_id = Some(v as u32); + } + Ok(a) + } +} + +impl ParquetFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ParquetFormat::default(); + if let Some(s) = opts.pull_opt_str(opt::PARQUET_COMPRESSION)? { + p.compression = ParquetCompression::from_str(&s) + .map_err(|e| plan_datafusion_err!("invalid parquet.compression: {e}"))?; + } + if let Some(v) = opts.pull_opt_u64(opt::PARQUET_ROW_GROUP_BYTES)? 
{ + p.row_group_bytes = Some(v); + } + Ok(p) + } +} + +impl ProtobufFormat { + fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let mut p = ProtobufFormat { + into_unstructured_json: false, + message_name: None, + compiled_schema: None, + confluent_schema_registry: false, + length_delimited: false, + }; + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_INTO_UNSTRUCTURED_JSON)? { + p.into_unstructured_json = v; + } + if let Some(s) = opts.pull_opt_str(opt::PROTOBUF_MESSAGE_NAME)? { + p.message_name = Some(s); + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_CONFLUENT_SCHEMA_REGISTRY)? { + p.confluent_schema_registry = v; + } + if let Some(v) = opts.pull_opt_bool(opt::PROTOBUF_LENGTH_DELIMITED)? { + p.length_delimited = v; + } + Ok(p) + } +} + +impl Framing { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult> { + let method = opts.pull_opt_str(opt::FRAMING_METHOD)?; + match method.as_deref() { + None => Ok(None), + Some(framing_method_value::NEWLINE) | Some(framing_method_value::NEWLINE_DELIMITED) => { + let max = opts.pull_opt_u64(opt::FRAMING_MAX_LINE_LENGTH)?; + Ok(Some(Framing::Newline(NewlineDelimitedFraming { + max_line_length: max, + }))) + } + Some(other) => plan_err!("unknown framing.method '{other}'"), + } + } +} + +impl BadData { + pub fn from_opts(opts: &mut ConnectorOptions) -> DFResult { + let Some(s) = opts.pull_opt_str(opt::BAD_DATA)? else { + return Ok(BadData::Fail {}); + }; + let v = s.to_lowercase(); + match v.as_str() { + bad_data_value::FAIL => Ok(BadData::Fail {}), + bad_data_value::DROP => Ok(BadData::Drop {}), + _ => plan_err!("invalid bad_data '{s}'"), + } + } +} diff --git a/src/sql/common/formats.rs b/src/sql/common/formats.rs new file mode 100644 index 00000000..aad3ce18 --- /dev/null +++ b/src/sql/common/formats.rs @@ -0,0 +1,255 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::convert::TryFrom; +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +use super::constants::{ + connection_format_value, decimal_encoding_value, json_compression_value, + parquet_compression_value, timestamp_format_value, +}; + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum TimestampFormat { + #[default] + #[serde(rename = "rfc3339")] + RFC3339, + UnixMillis, +} + +impl TryFrom<&str> for TimestampFormat { + type Error = (); + + fn try_from(value: &str) -> Result { + match value { + timestamp_format_value::RFC3339_UPPER | timestamp_format_value::RFC3339_SNAKE => { + Ok(TimestampFormat::RFC3339) + } + timestamp_format_value::UNIX_MILLIS_PASCAL + | timestamp_format_value::UNIX_MILLIS_SNAKE => Ok(TimestampFormat::UnixMillis), + _ => Err(()), + } + } +} + +#[derive(Copy, Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum DecimalEncoding { + #[default] + Number, + String, + Bytes, +} + +impl TryFrom<&str> for DecimalEncoding { + type Error = (); + + fn try_from(s: &str) -> Result { + match s { + decimal_encoding_value::NUMBER => Ok(Self::Number), + decimal_encoding_value::STRING => Ok(Self::String), + decimal_encoding_value::BYTES => Ok(Self::Bytes), + _ => Err(()), + } + } +} + +#[derive(Serialize, Deserialize, Default, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub enum 
JsonCompression { + #[default] + Uncompressed, + Gzip, +} + +impl FromStr for JsonCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + json_compression_value::UNCOMPRESSED => Ok(JsonCompression::Uncompressed), + json_compression_value::GZIP => Ok(JsonCompression::Gzip), + _ => Err(format!("invalid json compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct JsonFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default, alias = "confluent_schema_version")] + pub schema_id: Option, + #[serde(default)] + pub include_schema: bool, + #[serde(default)] + pub debezium: bool, + #[serde(default)] + pub unstructured: bool, + #[serde(default)] + pub timestamp_format: TimestampFormat, + #[serde(default)] + pub decimal_encoding: DecimalEncoding, + #[serde(default)] + pub compression: JsonCompression, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawStringFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct RawBytesFormat {} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct AvroFormat { + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub raw_datums: bool, + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub schema_id: Option, +} + +impl AvroFormat { + pub fn new( + confluent_schema_registry: bool, + raw_datums: bool, + into_unstructured_json: bool, + ) -> Self { + Self { + confluent_schema_registry, + raw_datums, + into_unstructured_json, + schema_id: None, + } + } +} + +#[derive(Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Default)] 
+#[serde(rename_all = "snake_case")] +pub enum ParquetCompression { + Uncompressed, + Snappy, + Gzip, + #[default] + Zstd, + Lz4, + Lz4Raw, +} + +impl FromStr for ParquetCompression { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + parquet_compression_value::UNCOMPRESSED => Ok(ParquetCompression::Uncompressed), + parquet_compression_value::SNAPPY => Ok(ParquetCompression::Snappy), + parquet_compression_value::GZIP => Ok(ParquetCompression::Gzip), + parquet_compression_value::ZSTD => Ok(ParquetCompression::Zstd), + parquet_compression_value::LZ4 => Ok(ParquetCompression::Lz4), + parquet_compression_value::LZ4_RAW => Ok(ParquetCompression::Lz4Raw), + _ => Err(format!("invalid parquet compression '{s}'")), + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd, Default)] +#[serde(rename_all = "snake_case")] +pub struct ParquetFormat { + #[serde(default)] + pub compression: ParquetCompression, + #[serde(default)] + pub row_group_bytes: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct ProtobufFormat { + #[serde(default)] + pub into_unstructured_json: bool, + #[serde(default)] + pub message_name: Option, + #[serde(default)] + pub compiled_schema: Option>, + #[serde(default)] + pub confluent_schema_registry: bool, + #[serde(default)] + pub length_delimited: bool, +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum Format { + Json(JsonFormat), + Avro(AvroFormat), + Protobuf(ProtobufFormat), + Parquet(ParquetFormat), + RawString(RawStringFormat), + RawBytes(RawBytesFormat), +} + +impl Display for Format { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(self.name()) + } +} + +impl Format { + pub fn name(&self) -> &'static str { + match self { + Format::Json(_) => connection_format_value::JSON, + 
Format::Avro(_) => connection_format_value::AVRO, + Format::Protobuf(_) => connection_format_value::PROTOBUF, + Format::Parquet(_) => connection_format_value::PARQUET, + Format::RawString(_) => connection_format_value::RAW_STRING, + Format::RawBytes(_) => connection_format_value::RAW_BYTES, + } + } + + pub fn is_updating(&self) -> bool { + matches!(self, Format::Json(JsonFormat { debezium: true, .. })) + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "behavior")] +pub enum BadData { + Fail {}, + Drop {}, +} + +impl Default for BadData { + fn default() -> Self { + BadData::Fail {} + } +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case", tag = "method")] +pub enum Framing { + Newline(NewlineDelimitedFraming), +} + +#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Hash, PartialOrd)] +#[serde(rename_all = "snake_case")] +pub struct NewlineDelimitedFraming { + pub max_line_length: Option, +} diff --git a/src/sql/common/fs_schema.rs b/src/sql/common/fs_schema.rs new file mode 100644 index 00000000..76a08537 --- /dev/null +++ b/src/sql/common/fs_schema.rs @@ -0,0 +1,470 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::{TIMESTAMP_FIELD, to_nanos}; +use crate::sql::common::converter::Converter; +use arrow::compute::kernels::cmp::gt_eq; +use arrow::compute::kernels::numeric::div; +use arrow::compute::{SortColumn, filter_record_batch, lexsort_to_indices, partition, take}; +use arrow::row::SortField; +use arrow_array::types::UInt64Type; +use arrow_array::{PrimitiveArray, UInt64Array}; +use datafusion::arrow::array::builder::{ArrayBuilder, make_builder}; +use datafusion::arrow::array::{RecordBatch, TimestampNanosecondArray}; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaBuilder, TimeUnit}; +use datafusion::arrow::error::ArrowError; +use datafusion::common::{DataFusionError, Result as DFResult}; +use protocol::function_stream_graph; +use serde::{Deserialize, Serialize}; +use std::ops::Range; +use std::sync::Arc; +use std::time::SystemTime; + +#[derive(Debug, Copy, Clone)] +pub enum FieldValueType<'a> { + Int64(Option), + UInt64(Option), + Int32(Option), + String(Option<&'a str>), + Bytes(Option<&'a [u8]>), +} + +pub type FsSchemaRef = Arc; + +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] +pub struct FsSchema { + pub schema: Arc, + pub timestamp_index: usize, + key_indices: Option>, + /// If defined, these indices are used for routing (i.e., which subtask gets which piece of data) + routing_key_indices: Option>, +} + +impl TryFrom for FsSchema { + type Error = DataFusionError; + fn try_from(schema_proto: function_stream_graph::FsSchema) -> Result { + let schema: Schema = serde_json::from_str(&schema_proto.arrow_schema) + .map_err(|e| DataFusionError::Plan(format!("Invalid arrow schema: {e}")))?; + let timestamp_index = schema_proto.timestamp_index as usize; + + let key_indices = schema_proto.has_keys.then(|| { + schema_proto + .key_indices + .into_iter() + .map(|index| index as usize) + .collect() + }); + + let routing_key_indices = schema_proto.has_routing_keys.then(|| { + schema_proto + .routing_key_indices + 
.into_iter() + .map(|index| index as usize) + .collect() + }); + + Ok(Self { + schema: Arc::new(schema), + timestamp_index, + key_indices, + routing_key_indices, + }) + } +} + +impl From for function_stream_graph::FsSchema { + fn from(schema: FsSchema) -> Self { + let arrow_schema = serde_json::to_string(schema.schema.as_ref()).unwrap(); + let timestamp_index = schema.timestamp_index as u32; + + let has_keys = schema.key_indices.is_some(); + let key_indices = schema + .key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + let has_routing_keys = schema.routing_key_indices.is_some(); + let routing_key_indices = schema + .routing_key_indices + .map(|ks| ks.into_iter().map(|index| index as u32).collect()) + .unwrap_or_default(); + + Self { + arrow_schema, + timestamp_index, + key_indices, + has_keys, + routing_key_indices, + has_routing_keys, + } + } +} + +impl FsSchema { + pub fn new( + schema: Arc, + timestamp_index: usize, + key_indices: Option>, + routing_key_indices: Option>, + ) -> Self { + Self { + schema, + timestamp_index, + key_indices, + routing_key_indices, + } + } + pub fn new_unkeyed(schema: Arc, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + } + } + pub fn new_keyed(schema: Arc, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + } + } + + pub fn from_fields(mut fields: Vec) -> Self { + if !fields.iter().any(|f| f.name() == TIMESTAMP_FIELD) { + fields.push(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )); + } + + Self::from_schema_keys(Arc::new(Schema::new(fields)), vec![]).unwrap() + } + + pub fn from_schema_unkeyed(schema: Arc) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no 
{TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn from_schema_keys(schema: Arc, key_indices: Vec) -> DFResult { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "no {TIMESTAMP_FIELD} field in schema, schema is {schema:?}" + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + routing_key_indices: None, + }) + } + + pub fn schema_without_timestamp(&self) -> Schema { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(self.timestamp_index); + builder.finish() + } + + pub fn remove_timestamp_column(&self, batch: &mut RecordBatch) { + batch.remove_column(self.timestamp_index); + } + + pub fn builders(&self) -> Vec> { + self.schema + .fields + .iter() + .map(|f| make_builder(f.data_type(), 8)) + .collect() + } + + pub fn timestamp_column<'a>(&self, batch: &'a RecordBatch) -> &'a TimestampNanosecondArray { + batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .unwrap() + } + + pub fn has_routing_keys(&self) -> bool { + self.routing_keys().map(|k| !k.is_empty()).unwrap_or(false) + } + + pub fn routing_keys(&self) -> Option<&Vec> { + self.routing_key_indices + .as_ref() + .or(self.key_indices.as_ref()) + } + + pub fn storage_keys(&self) -> Option<&Vec> { + self.key_indices.as_ref() + } + + pub fn clone_storage_key_indices(&self) -> Option> { + self.key_indices.clone() + } + + pub fn clone_routing_key_indices(&self) -> Option> { + self.routing_key_indices.clone() + } + + pub fn filter_by_time( + &self, + batch: RecordBatch, + cutoff: Option, + ) -> Result { + let Some(cutoff) = cutoff else { + // no watermark, so we just return the same batch. 
+ return Ok(batch); + }; + // filter out late data + let timestamp_column = batch + .column(self.timestamp_index) + .as_any() + .downcast_ref::() + .ok_or_else(|| ArrowError::CastError( + format!("failed to downcast column {} of {:?} to timestamp. Schema is supposed to be {:?}", + self.timestamp_index, batch, self.schema)))?; + let cutoff_scalar = TimestampNanosecondArray::new_scalar(to_nanos(cutoff) as i64); + let on_time = gt_eq(timestamp_column, &cutoff_scalar)?; + filter_record_batch(&batch, &on_time) + } + + pub fn sort_columns(&self, batch: &RecordBatch, with_timestamp: bool) -> Vec { + let mut columns = vec![]; + if let Some(keys) = &self.key_indices { + columns.extend(keys.iter().map(|index| SortColumn { + values: batch.column(*index).clone(), + options: None, + })); + } + if with_timestamp { + columns.push(SortColumn { + values: batch.column(self.timestamp_index).clone(), + options: None, + }); + } + columns + } + + pub fn sort_fields(&self, with_timestamp: bool) -> Vec { + let mut sort_fields = vec![]; + if let Some(keys) = &self.key_indices { + sort_fields.extend(keys.iter()); + } + if with_timestamp { + sort_fields.push(self.timestamp_index); + } + self.sort_fields_by_indices(&sort_fields) + } + + fn sort_fields_by_indices(&self, indices: &[usize]) -> Vec { + indices + .iter() + .map(|index| SortField::new(self.schema.field(*index).data_type().clone())) + .collect() + } + + pub fn converter(&self, with_timestamp: bool) -> Result { + Converter::new(self.sort_fields(with_timestamp)) + } + + pub fn value_converter( + &self, + with_timestamp: bool, + generation_index: usize, + ) -> Result { + match &self.key_indices { + None => { + let mut indices = (0..self.schema.fields().len()).collect::>(); + indices.remove(generation_index); + if !with_timestamp { + indices.remove(self.timestamp_index); + } + Converter::new(self.sort_fields_by_indices(&indices)) + } + Some(keys) => { + let indices = (0..self.schema.fields().len()) + .filter(|index| { + 
!keys.contains(index) + && (with_timestamp || *index != self.timestamp_index) + && *index != generation_index + }) + .collect::>(); + Converter::new(self.sort_fields_by_indices(&indices)) + } + } + } + + pub fn value_indices(&self, with_timestamp: bool) -> Vec { + let field_count = self.schema.fields().len(); + match &self.key_indices { + None => { + let mut indices = (0..field_count).collect::>(); + + if !with_timestamp { + indices.remove(self.timestamp_index); + } + indices + } + Some(keys) => (0..field_count) + .filter(|index| { + !keys.contains(index) && (with_timestamp || *index != self.timestamp_index) + }) + .collect::>(), + } + } + + pub fn sort( + &self, + batch: RecordBatch, + with_timestamp: bool, + ) -> Result { + if self.key_indices.is_none() && !with_timestamp { + return Ok(batch); + } + let sort_columns = self.sort_columns(&batch, with_timestamp); + let sort_indices = lexsort_to_indices(&sort_columns, None).expect("should be able to sort"); + let columns = batch + .columns() + .iter() + .map(|c| take(c, &sort_indices, None).unwrap()) + .collect(); + + RecordBatch::try_new(batch.schema(), columns) + } + + pub fn partition( + &self, + batch: &RecordBatch, + with_timestamp: bool, + ) -> Result>, ArrowError> { + if self.key_indices.is_none() && !with_timestamp { + #[allow(clippy::single_range_in_vec_init)] + return Ok(vec![0..batch.num_rows()]); + } + + let mut partition_columns = vec![]; + + if let Some(keys) = &self.routing_keys() { + partition_columns.extend(keys.iter().map(|index| batch.column(*index).clone())); + } + if with_timestamp { + partition_columns.push(batch.column(self.timestamp_index).clone()); + } + + Ok(partition(&partition_columns)?.ranges()) + } + + pub fn unkeyed_batch(&self, batch: &RecordBatch) -> Result { + if self.key_indices.is_none() { + return Ok(batch.clone()); + } + let columns: Vec<_> = (0..batch.num_columns()) + .filter(|index| !self.key_indices.as_ref().unwrap().contains(index)) + .collect(); + batch.project(&columns) + } 
+ + pub fn schema_without_keys(&self) -> Result { + if self.key_indices.is_none() { + return Ok(self.clone()); + } + let key_indices = self.key_indices.as_ref().unwrap(); + let unkeyed_schema = Schema::new( + self.schema + .fields() + .iter() + .enumerate() + .filter(|(index, _field)| !key_indices.contains(index)) + .map(|(_, field)| field.as_ref().clone()) + .collect::>(), + ); + let timestamp_index = unkeyed_schema.index_of(TIMESTAMP_FIELD)?; + Ok(Self { + schema: Arc::new(unkeyed_schema), + timestamp_index, + key_indices: None, + routing_key_indices: None, + }) + } + + pub fn with_fields(&self, fields: Vec) -> Result { + let schema = Arc::new(Schema::new_with_metadata( + fields, + self.schema.metadata.clone(), + )); + + let timestamp_index = schema.index_of(TIMESTAMP_FIELD)?; + let max_index = *[&self.key_indices, &self.routing_key_indices] + .iter() + .map(|indices| indices.as_ref().and_then(|k| k.iter().max())) + .max() + .flatten() + .unwrap_or(&0); + + if schema.fields.len() - 1 < max_index { + return Err(ArrowError::InvalidArgumentError(format!( + "expected at least {} fields, but were only {}", + max_index + 1, + schema.fields.len() + ))); + } + + Ok(Self { + schema, + timestamp_index, + key_indices: self.key_indices.clone(), + routing_key_indices: self.routing_key_indices.clone(), + }) + } + + pub fn with_additional_fields( + &self, + new_fields: impl Iterator, + ) -> Result { + let mut fields = self.schema.fields.to_vec(); + fields.extend(new_fields.map(Arc::new)); + + self.with_fields(fields) + } +} + +pub fn server_for_hash_array( + hash: &PrimitiveArray, + n: usize, +) -> Result, ArrowError> { + let range_size = u64::MAX / (n as u64) + 1; + let range_scalar = UInt64Array::new_scalar(range_size); + let division = div(hash, &range_scalar)?; + let result: &PrimitiveArray = division.as_any().downcast_ref().unwrap(); + Ok(result.clone()) +} diff --git a/src/sql/common/kafka_catalog.rs b/src/sql/common/kafka_catalog.rs new file mode 100644 index 
00000000..51ceee67 --- /dev/null +++ b/src/sql/common/kafka_catalog.rs @@ -0,0 +1,116 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct KafkaTable { + pub topic: String, + #[serde(flatten)] + pub kind: TableType, + #[serde(default)] + pub client_configs: HashMap, + pub value_subject: Option, +} + +impl KafkaTable { + pub fn subject(&self) -> String { + self.value_subject + .clone() + .unwrap_or_else(|| format!("{}-value", self.topic)) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TableType { + Source { + offset: KafkaTableSourceOffset, + read_mode: Option, + group_id: Option, + group_id_prefix: Option, + }, + Sink { + commit_mode: SinkCommitMode, + key_field: Option, + timestamp_field: Option, + }, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum KafkaTableSourceOffset { + Latest, + Earliest, + #[default] + Group, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ReadMode { + ReadUncommitted, + ReadCommitted, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "snake_case")] +pub enum SinkCommitMode { + #[default] + AtLeastOnce, + 
ExactlyOnce, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct KafkaConfig { + pub bootstrap_servers: String, + #[serde(default)] + pub authentication: KafkaConfigAuthentication, + #[serde(default)] + pub schema_registry_enum: Option, + #[serde(default)] + pub connection_properties: HashMap, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum KafkaConfigAuthentication { + #[default] + #[serde(rename = "None")] + None, + #[serde(rename = "AWS_MSK_IAM")] + AwsMskIam { region: String }, + #[serde(rename = "SASL")] + Sasl { + protocol: String, + mechanism: String, + username: String, + password: String, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type")] +pub enum SchemaRegistryConfig { + #[serde(rename = "None")] + None, + #[serde(rename = "Confluent Schema Registry")] + ConfluentSchemaRegistry { + endpoint: String, + #[serde(rename = "apiKey")] + api_key: Option, + #[serde(rename = "apiSecret")] + api_secret: Option, + }, +} diff --git a/src/sql/common/mod.rs b/src/sql/common/mod.rs new file mode 100644 index 00000000..af44cb0f --- /dev/null +++ b/src/sql/common/mod.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared core types and constants for FunctionStream (`crate::sql::common`). +//! +//! Used by the runtime, SQL planner, coordinator, and other subsystems — +//! 
analogous to `arroyo-types` + `arroyo-rpc` in Arroyo. + +pub mod arrow_ext; +pub mod connector_options; +pub mod constants; +pub mod control; +pub mod converter; +pub mod date; +pub mod debezium; +pub mod errors; +pub mod format_from_opts; +pub mod formats; +pub mod fs_schema; +pub mod kafka_catalog; +pub mod operator_config; +pub mod time_utils; +pub mod topology; +pub mod with_option_keys; + +// ── Re-exports from existing modules ── +pub use crate::runtime::streaming::protocol::{CheckpointBarrier, Watermark}; +pub use arrow_ext::FsExtensionType; +pub use time_utils::{from_nanos, to_micros, to_millis, to_nanos}; + +// ── Re-exports from new modules ── +pub use connector_options::ConnectorOptions; +pub use formats::{BadData, Format, Framing, JsonCompression, JsonFormat}; +pub use fs_schema::{FsSchema, FsSchemaRef}; +pub use operator_config::MetadataField; + +// ── Well-known column names ── +pub use constants::sql_field::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +pub use topology::render_program_topology; + +// ── Environment variables ── +pub const JOB_ID_ENV: &str = "JOB_ID"; +pub const RUN_ID_ENV: &str = "RUN_ID"; + +// ── Metric names ── +pub const MESSAGES_RECV: &str = "fs_worker_messages_recv"; +pub const MESSAGES_SENT: &str = "fs_worker_messages_sent"; +pub const BYTES_RECV: &str = "fs_worker_bytes_recv"; +pub const BYTES_SENT: &str = "fs_worker_bytes_sent"; +pub const BATCHES_RECV: &str = "fs_worker_batches_recv"; +pub const BATCHES_SENT: &str = "fs_worker_batches_sent"; +pub const TX_QUEUE_SIZE: &str = "fs_worker_tx_queue_size"; +pub const TX_QUEUE_REM: &str = "fs_worker_tx_queue_rem"; +pub const DESERIALIZATION_ERRORS: &str = "fs_worker_deserialization_errors"; + +pub const LOOKUP_KEY_INDEX_FIELD: &str = "__lookup_key_index"; diff --git a/src/sql/common/operator_config.rs b/src/sql/common/operator_config.rs new file mode 100644 index 00000000..b5360cd7 --- /dev/null +++ b/src/sql/common/operator_config.rs @@ -0,0 +1,12 @@ +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetadataField { + pub field_name: String, + pub key: String, + #[serde(default)] + pub data_type: Option, +} diff --git a/src/sql/common/time_utils.rs b/src/sql/common/time_utils.rs new file mode 100644 index 00000000..323445cd --- /dev/null +++ b/src/sql/common/time_utils.rs @@ -0,0 +1,74 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::hash::Hash; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub fn to_millis(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 +} + +pub fn to_micros(time: SystemTime) -> u64 { + time.duration_since(UNIX_EPOCH).unwrap().as_micros() as u64 +} + +pub fn from_millis(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_millis(ts) +} + +pub fn from_micros(ts: u64) -> SystemTime { + UNIX_EPOCH + Duration::from_micros(ts) +} + +pub fn to_nanos(time: SystemTime) -> u128 { + time.duration_since(UNIX_EPOCH).unwrap().as_nanos() +} + +pub fn from_nanos(ts: u128) -> SystemTime { + UNIX_EPOCH + + Duration::from_secs((ts / 1_000_000_000) as u64) + + Duration::from_nanos((ts % 1_000_000_000) as u64) +} + +pub fn print_time(time: SystemTime) -> String { + chrono::DateTime::::from(time) + .format("%Y-%m-%d %H:%M:%S%.3f") + .to_string() +} + +/// Returns the number of days since the UNIX epoch (for Avro serialization). +pub fn days_since_epoch(time: SystemTime) -> i32 { + time.duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + .div_euclid(86400) as i32 +} + +pub fn single_item_hash_map, K: Hash + Eq, V>(key: I, value: V) -> HashMap { + let mut map = HashMap::new(); + map.insert(key.into(), value); + map +} + +pub fn string_to_map(s: &str, pair_delimiter: char) -> Option> { + if s.trim().is_empty() { + return Some(HashMap::new()); + } + + s.split(',') + .map(|s| { + let mut kv = s.trim().split(pair_delimiter); + Some((kv.next()?.trim().to_string(), kv.next()?.trim().to_string())) + }) + .collect() +} diff --git a/src/sql/common/topology.rs b/src/sql/common/topology.rs new file mode 100644 index 00000000..3b4f892f --- /dev/null +++ b/src/sql/common/topology.rs @@ -0,0 +1,295 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::{BTreeMap, VecDeque};
use std::fmt::Write;

use protocol::function_stream_graph::FsProgram;

/// Maps the numeric `edge_type` codes used in `FsProgram` edges to a
/// human-readable label. Unrecognized codes render as "Unknown".
fn edge_type_label(edge_type: i32) -> &'static str {
    match edge_type {
        1 => "Forward",
        2 => "Shuffle",
        3 => "LeftJoin",
        4 => "RightJoin",
        _ => "Unknown",
    }
}

/// Renders a multi-line, human-readable description of a compiled pipeline
/// topology: one section per node (in topological order), each listing its
/// role (Source/Operator/Sink), parallelism, chained operators, description,
/// and incoming/outgoing edges. Consecutive nodes connected by a single
/// direct edge additionally get a small `| <EdgeType> v` connector drawing.
pub fn render_program_topology(program: &FsProgram) -> String {
    if program.nodes.is_empty() {
        return "(empty topology)".to_string();
    }

    // Lightweight adjacency records; only the fields needed for rendering.
    struct EdgeInfo {
        target: i32,
        edge_type: i32,
    }
    struct InputInfo {
        source: i32,
        edge_type: i32,
    }

    // BTreeMap keeps node iteration deterministic (sorted by node_index).
    let node_map: BTreeMap<i32, _> =
        program.nodes.iter().map(|n| (n.node_index, n)).collect();

    let mut downstream: BTreeMap<i32, Vec<EdgeInfo>> = BTreeMap::new();
    let mut upstream: BTreeMap<i32, Vec<InputInfo>> = BTreeMap::new();
    let mut in_degree: BTreeMap<i32, usize> = BTreeMap::new();

    // Seed every node with in-degree 0 so isolated nodes still appear.
    for idx in node_map.keys() {
        in_degree.entry(*idx).or_insert(0);
    }
    for edge in &program.edges {
        downstream.entry(edge.source).or_default().push(EdgeInfo {
            target: edge.target,
            edge_type: edge.edge_type,
        });
        upstream.entry(edge.target).or_default().push(InputInfo {
            source: edge.source,
            edge_type: edge.edge_type,
        });
        *in_degree.entry(edge.target).or_insert(0) += 1;
    }

    // Kahn's topological sort
    let mut queue: VecDeque<i32> = in_degree
        .iter()
        .filter(|(_, deg)| **deg == 0)
        .map(|(idx, _)| *idx)
        .collect();
    let mut topo_order: Vec<i32> = Vec::with_capacity(node_map.len());
    let mut remaining = in_degree.clone();
    while let Some(idx) = queue.pop_front() {
        topo_order.push(idx);
        if let Some(edges) = downstream.get(&idx) {
            for e in edges {
                if let Some(deg) = remaining.get_mut(&e.target) {
                    *deg -= 1;
                    if *deg == 0 {
                        queue.push_back(e.target);
                    }
                }
            }
        }
    }
    // If the graph has a cycle, Kahn's sort will not visit every node;
    // append the leftovers so the rendering is still total.
    for idx in node_map.keys() {
        if !topo_order.contains(idx) {
            topo_order.push(*idx);
        }
    }

    // Role classification is purely structural: no inputs => Source,
    // no outputs => Sink, otherwise Operator.
    let is_source = |idx: &i32| upstream.get(idx).is_none_or(|v| v.is_empty());
    let is_sink = |idx: &i32| downstream.get(idx).is_none_or(|v| v.is_empty());

    let mut out = String::new();
    let _ = writeln!(
        out,
        "Pipeline Topology ({} nodes, {} edges)",
        program.nodes.len(),
        program.edges.len(),
    );
    let _ = writeln!(out, "{}", "=".repeat(50));

    for (pos, &node_idx) in topo_order.iter().enumerate() {
        let Some(node) = node_map.get(&node_idx) else {
            continue;
        };

        // Chained operators are shown as "A -> B -> C".
        let op_chain: String = node
            .operators
            .iter()
            .map(|op| op.operator_name.as_str())
            .collect::<Vec<_>>()
            .join(" -> ");

        let role = if is_source(&node_idx) {
            "Source"
        } else if is_sink(&node_idx) {
            "Sink"
        } else {
            "Operator"
        };

        let _ = writeln!(out);
        let _ = writeln!(
            out,
            "[{role}] Node {node_idx} parallelism = {}",
            node.parallelism,
        );
        let _ = writeln!(out, "  operators: {op_chain}");

        if !node.description.is_empty() {
            let _ = writeln!(out, "  desc: {}", node.description);
        }

        // Single input is rendered inline; multiple inputs get a list.
        if let Some(inputs) = upstream.get(&node_idx) {
            if inputs.len() == 1 {
                let i = &inputs[0];
                let _ = writeln!(
                    out,
                    "  input: <-- [{}] Node {}",
                    edge_type_label(i.edge_type),
                    i.source,
                );
            } else if inputs.len() > 1 {
                let _ = writeln!(out, "  inputs:");
                for i in inputs {
                    let _ = writeln!(
                        out,
                        "    <-- [{}] Node {}",
                        edge_type_label(i.edge_type),
                        i.source,
                    );
                }
            }
        }

        if let Some(outputs) = downstream.get(&node_idx) {
            if outputs.len() == 1 {
                let e = &outputs[0];
                let _ = writeln!(
                    out,
                    "  output: --> [{}] Node {}",
                    edge_type_label(e.edge_type),
                    e.target,
                );
            } else if outputs.len() > 1 {
                let _ = writeln!(out, "  outputs:");
                for e in outputs {
                    let _ = writeln!(
                        out,
                        "    --> [{}] Node {}",
                        edge_type_label(e.edge_type),
                        e.target,
                    );
                }
            }
        }

        // Draw a vertical connector only when this node's single output edge
        // goes directly to the next node in topo order, and that next node
        // has exactly one input (i.e. the link is unambiguous visually).
        if pos < topo_order.len() - 1 {
            let single_out = downstream.get(&node_idx).is_some_and(|v| v.len() == 1);
            let next_idx = topo_order.get(pos + 1).copied();
            let is_direct = single_out
                && next_idx
                    .is_some_and(|n| downstream.get(&node_idx).is_some_and(|v| v[0].target == n));
            let next_single_in = next_idx
                .and_then(|n| upstream.get(&n))
                .is_some_and(|v| v.len() == 1);

            if is_direct && next_single_in {
                let etype = downstream.get(&node_idx).unwrap()[0].edge_type;
                let _ = writeln!(out, "  |");
                let _ = writeln!(out, "  | {}", edge_type_label(etype));
                let _ = writeln!(out, "  v");
            }
        }
    }

    out.trim_end().to_string()
}

#[cfg(test)]
mod tests {
    use super::*;
    use protocol::function_stream_graph::{ChainedOperator, FsEdge, FsNode, FsProgram};

    // Builds a node with the given (operator_id, operator_name) pairs.
    fn make_node(
        node_index: i32,
        operators: Vec<(&str, &str)>,
        desc: &str,
        parallelism: u32,
    ) -> FsNode {
        FsNode {
            node_index,
            node_id: node_index as u32,
            parallelism,
            description: desc.to_string(),
            operators: operators
                .into_iter()
                .map(|(id, name)| ChainedOperator {
                    operator_id: id.to_string(),
                    operator_name: name.to_string(),
                    operator_config: Vec::new(),
                })
                .collect(),
            edges: Vec::new(),
        }
    }

    fn make_edge(source: i32, target: i32, edge_type: i32) -> FsEdge {
        FsEdge {
            source,
            target,
            schema: None,
            edge_type,
        }
    }

    #[test]
    fn empty_program_renders_placeholder() {
        let program = FsProgram {
            nodes: vec![],
            edges: vec![],
            program_config: None,
        };
        assert_eq!(render_program_topology(&program), "(empty topology)");
    }

    #[test]
    fn linear_pipeline_renders_correctly() {
        let program = FsProgram {
            nodes: vec![
                make_node(0, vec![("src_0", "ConnectorSource")], "", 1),
                make_node(
                    1,
                    vec![("val_1", "Value"), ("wm_2", "ExpressionWatermark")],
                    "source -> watermark",
                    1,
                ),
                make_node(2, vec![("sink_3", "ConnectorSink")], "sink (kafka)", 1),
            ],
            edges: vec![make_edge(0, 1, 1), make_edge(1, 2, 1)],
            program_config: None,
        };
        let result = render_program_topology(&program);
        assert!(result.contains("[Source] Node 0"));
        assert!(result.contains("[Operator] Node 1"));
        assert!(result.contains("[Sink] Node 2"));
        assert!(result.contains("ConnectorSource"));
        assert!(result.contains("Value -> ExpressionWatermark"));
        assert!(result.contains("Forward"));
    }

    #[test]
    fn join_topology_shows_multiple_inputs() {
        let program = FsProgram {
            nodes: vec![
                make_node(0, vec![("src_a", "ConnectorSource")], "source A", 1),
                make_node(1, vec![("src_b", "ConnectorSource")], "source B", 1),
                make_node(2, vec![("join_0", "WindowJoin")], "join node", 2),
                make_node(3, vec![("sink_0", "ConnectorSink")], "sink", 1),
            ],
            edges: vec![
                make_edge(0, 2, 3), // LeftJoin
                make_edge(1, 2, 4), // RightJoin
                make_edge(2, 3, 1), // Forward
            ],
            program_config: None,
        };
        let result = render_program_topology(&program);
        assert!(result.contains("inputs:"));
        assert!(result.contains("LeftJoin"));
        assert!(result.contains("RightJoin"));
        assert!(result.contains("[Operator] Node 2"));
    }
}

// ---- file boundary: src/sql/common/with_option_keys.rs (new file in patch) ----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! String keys accepted in SQL `WITH (...)` option maps, gathered in one
//! place so connectors and formats agree on spelling.

// Generic connector/table options.
pub const CONNECTOR: &str = "connector";
pub const TYPE: &str = "type";
pub const FORMAT: &str = "format";
pub const DEFAULT_FORMAT_VALUE: &str = "json";
pub const BAD_DATA: &str = "bad_data";
pub const PARTITION_BY: &str = "partition_by";

// Event-time / watermark configuration.
pub const EVENT_TIME_FIELD: &str = "event_time_field";
pub const WATERMARK_FIELD: &str = "watermark_field";

// Source idleness configuration.
pub const IDLE_MICROS: &str = "idle_micros";
pub const IDLE_TIME: &str = "idle_time";

// Lookup-table caching.
pub const LOOKUP_CACHE_MAX_BYTES: &str = "lookup.cache.max_bytes";
pub const LOOKUP_CACHE_TTL: &str = "lookup.cache.ttl";

pub const CONNECTION_SCHEMA: &str = "connection_schema";

pub const ADAPTER: &str = "adapter";

// Kafka connector options (current spellings plus legacy aliases).
pub const KAFKA_BOOTSTRAP_SERVERS: &str = "bootstrap.servers";
pub const KAFKA_BOOTSTRAP_SERVERS_LEGACY: &str = "bootstrap_servers";
pub const KAFKA_TOPIC: &str = "topic";
pub const KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND: &str = "rate_limit.messages_per_second";
pub const KAFKA_VALUE_SUBJECT: &str = "value.subject";
pub const KAFKA_SCAN_STARTUP_MODE: &str = "scan.startup.mode";
pub const KAFKA_ISOLATION_LEVEL: &str = "isolation.level";
pub const KAFKA_GROUP_ID: &str = "group.id";
pub const KAFKA_GROUP_ID_LEGACY: &str = "group_id";
pub const KAFKA_GROUP_ID_PREFIX: &str = "group.id.prefix";
pub const KAFKA_SINK_COMMIT_MODE: &str = "sink.commit.mode";
pub const KAFKA_SINK_KEY_FIELD: &str = "sink.key.field";
pub const KAFKA_KEY_FIELD_LEGACY: &str = "key.field";
pub const KAFKA_SINK_TIMESTAMP_FIELD: &str = "sink.timestamp.field";
pub const KAFKA_TIMESTAMP_FIELD_LEGACY: &str = "timestamp.field";

// JSON format options.
pub const JSON_CONFLUENT_SCHEMA_REGISTRY: &str = "json.confluent_schema_registry";
pub const JSON_CONFLUENT_SCHEMA_VERSION: &str = "json.confluent_schema_version";
pub const JSON_INCLUDE_SCHEMA: &str = "json.include_schema";
pub const JSON_DEBEZIUM: &str = "json.debezium";
pub const JSON_UNSTRUCTURED: &str = "json.unstructured";
pub const JSON_TIMESTAMP_FORMAT: &str = "json.timestamp_format";
pub const JSON_DECIMAL_ENCODING: &str = "json.decimal_encoding";
pub const JSON_COMPRESSION: &str = "json.compression";

// Avro format options.
pub const AVRO_CONFLUENT_SCHEMA_REGISTRY: &str = "avro.confluent_schema_registry";
pub const AVRO_RAW_DATUMS: &str = "avro.raw_datums";
pub const AVRO_INTO_UNSTRUCTURED_JSON: &str = "avro.into_unstructured_json";
pub const AVRO_SCHEMA_ID: &str = "avro.schema_id";

// Parquet format options.
pub const PARQUET_COMPRESSION: &str = "parquet.compression";
pub const PARQUET_ROW_GROUP_BYTES: &str = "parquet.row_group_bytes";

// Protobuf format options.
pub const PROTOBUF_INTO_UNSTRUCTURED_JSON: &str = "protobuf.into_unstructured_json";
pub const PROTOBUF_MESSAGE_NAME: &str = "protobuf.message_name";
pub const PROTOBUF_CONFLUENT_SCHEMA_REGISTRY: &str = "protobuf.confluent_schema_registry";
pub const PROTOBUF_LENGTH_DELIMITED: &str = "protobuf.length_delimited";

// Framing options for raw byte streams.
pub const FRAMING_METHOD: &str = "framing.method";
pub const FRAMING_MAX_LINE_LENGTH: &str = "framing.max_line_length";

pub const FORMAT_DEBEZIUM_FLAG: &str = "format.debezium";

// ---- file boundary: src/sql/functions/mod.rs (new file in patch) ----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::sql::schema::StreamSchemaProvider; +use datafusion::arrow::array::{ + Array, ArrayRef, StringArray, UnionArray, + builder::{FixedSizeBinaryBuilder, ListBuilder, StringBuilder}, + cast::{AsArray, as_string_array}, + types::{Float64Type, Int64Type}, +}; +use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; +use datafusion::arrow::row::{RowConverter, SortField}; +use datafusion::common::{DataFusionError, ScalarValue}; +use datafusion::common::{Result, TableReference}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::expr::{Alias, ScalarFunction}; +use datafusion::logical_expr::{ + ColumnarValue, LogicalPlan, Projection, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, create_udf, +}; +use datafusion::prelude::{Expr, col}; +use serde_json_path::JsonPath; +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{Debug, Write}; +use std::sync::{Arc, OnceLock}; + +use crate::sql::common::constants::scalar_fn; + +/// Borrowed from DataFusion +/// +/// Creates a singleton `ScalarUDF` of the `$UDF` function named `$GNAME` and a +/// function named `$NAME` which returns that function named $NAME. +/// +/// This is used to ensure creating the list of `ScalarUDF` only happens once. +#[macro_export] +macro_rules! 
make_udf_function { + ($UDF:ty, $GNAME:ident, $NAME:ident) => { + /// Singleton instance of the function + static $GNAME: std::sync::OnceLock> = + std::sync::OnceLock::new(); + + /// Return a [`ScalarUDF`] for [`$UDF`] + /// + /// [`ScalarUDF`]: datafusion_expr::ScalarUDF + pub fn $NAME() -> std::sync::Arc { + $GNAME + .get_or_init(|| { + std::sync::Arc::new(datafusion::logical_expr::ScalarUDF::new_from_impl( + <$UDF>::default(), + )) + }) + .clone() + } + }; +} + +make_udf_function!(MultiHashFunction, MULTI_HASH, multi_hash); + +pub fn register_all(registry: &mut dyn FunctionRegistry) { + registry + .register_udf(Arc::new(create_udf( + scalar_fn::GET_FIRST_JSON_OBJECT, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(get_first_json_object), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON, + vec![DataType::Utf8, DataType::Utf8], + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + Volatility::Immutable, + Arc::new(extract_json), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::EXTRACT_JSON_STRING, + vec![DataType::Utf8, DataType::Utf8], + DataType::Utf8, + Volatility::Immutable, + Arc::new(extract_json_string), + ))) + .unwrap(); + + registry + .register_udf(Arc::new(create_udf( + scalar_fn::SERIALIZE_JSON_UNION, + vec![DataType::Union(union_fields(), UnionMode::Sparse)], + DataType::Utf8, + Volatility::Immutable, + Arc::new(serialize_json_union), + ))) + .unwrap(); + + registry.register_udf(multi_hash()).unwrap(); +} + +fn parse_path(name: &str, path: &ScalarValue) -> Result> { + let path = match path { + ScalarValue::Utf8(Some(s)) => JsonPath::parse(s) + .map_err(|e| DataFusionError::Execution(format!("Invalid json path '{s}': {e:?}")))?, + ScalarValue::Utf8(None) => { + return Err(DataFusionError::Execution(format!( + "The path argument to {name} cannot be null" + ))); + } + _ => { + return 
Err(DataFusionError::Execution(format!( + "The path argument to {name} must be of type TEXT" + ))); + } + }; + + Ok(Arc::new(path)) +} + +// Hash function that can take any number of arguments and produces a fast (non-cryptographic) +// 128-bit hash from their string representations +#[derive(Debug)] +pub struct MultiHashFunction { + signature: Signature, +} + +impl MultiHashFunction { + pub fn invoke(&self, args: &[ColumnarValue]) -> Result { + let mut hasher = xxhash_rust::xxh3::Xxh3::new(); + + let all_scalar = args.iter().all(|a| matches!(a, ColumnarValue::Scalar(_))); + + let length = args + .iter() + .map(|t| match t { + ColumnarValue::Scalar(_) => 1, + ColumnarValue::Array(a) => a.len(), + }) + .max() + .ok_or_else(|| { + DataFusionError::Plan("multi_hash must have at least one argument".to_string()) + })?; + + let row_builder = RowConverter::new( + args.iter() + .map(|t| SortField::new(t.data_type().clone())) + .collect(), + )?; + + let arrays = args + .iter() + .map(|c| c.clone().into_array(length)) + .collect::>>()?; + let rows = row_builder.convert_columns(&arrays)?; + + if all_scalar { + hasher.update(rows.row(0).as_ref()); + let result = hasher.digest128().to_be_bytes().to_vec(); + hasher.reset(); + Ok(ColumnarValue::Scalar(ScalarValue::FixedSizeBinary( + size_of::() as i32, + Some(result), + ))) + } else { + let mut builder = + FixedSizeBinaryBuilder::with_capacity(length, size_of::() as i32); + + for row in rows.iter() { + hasher.update(row.as_ref()); + builder.append_value(hasher.digest128().to_be_bytes())?; + hasher.reset(); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + } +} + +impl Default for MultiHashFunction { + fn default() -> Self { + Self { + signature: Signature::new(TypeSignature::VariadicAny, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for MultiHashFunction { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + scalar_fn::MULTI_HASH + } + + fn signature(&self) -> &Signature { + 
&self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::FixedSizeBinary(size_of::() as i32)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + self.invoke(&args.args) + } +} + +fn json_function( + name: &str, + f: F, + to_scalar: ToS, + args: &[ColumnarValue], +) -> Result +where + ArrayT: Array + FromIterator> + 'static, + F: Fn(serde_json::Value, &JsonPath) -> Option, + ToS: Fn(Option) -> ScalarValue, +{ + assert_eq!(args.len(), 2); + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let vs = as_string_array(values); + ColumnarValue::Array(Arc::new( + vs.iter() + .map(|s| s.and_then(|s| f(serde_json::from_str(s).ok()?, &path))) + .collect::(), + ) as ArrayRef) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path(name, path)?; + let ScalarValue::Utf8(value) = value else { + return Err(DataFusionError::Execution(format!( + "The value argument to {name} must be of type TEXT" + ))); + }; + + let result = value + .as_ref() + .and_then(|v| f(serde_json::from_str(v).ok()?, &path)); + ColumnarValue::Scalar(to_scalar(result)) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to {name} must be a literal".to_string(), + )); + } + }) +} + +pub fn extract_json(args: &[ColumnarValue]) -> Result { + assert_eq!(args.len(), 2); + + let inner = |s, path: &JsonPath| { + Some( + path.query(&serde_json::from_str(s).ok()?) 
+ .iter() + .map(|v| Some(v.to_string())) + .collect::>>(), + ) + }; + + Ok(match (&args[0], &args[1]) { + (ColumnarValue::Array(values), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let values = as_string_array(values); + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), values.len()); + + let queried = values.iter().map(|s| s.and_then(|s| inner(s, &path))); + + for v in queried { + builder.append_option(v); + } + + ColumnarValue::Array(Arc::new(builder.finish())) + } + (ColumnarValue::Scalar(value), ColumnarValue::Scalar(path)) => { + let path = parse_path("extract_json", path)?; + let ScalarValue::Utf8(v) = value else { + return Err(DataFusionError::Execution( + "The value argument to extract_json must be of type TEXT".to_string(), + )); + }; + + let mut builder = ListBuilder::with_capacity(StringBuilder::new(), 1); + let result = v.as_ref().and_then(|s| inner(s, &path)); + builder.append_option(result); + + ColumnarValue::Scalar(ScalarValue::List(Arc::new(builder.finish()))) + } + _ => { + return Err(DataFusionError::Execution( + "The path argument to extract_json must be a literal".to_string(), + )); + } + }) +} + +pub fn get_first_json_object(args: &[ColumnarValue]) -> Result { + json_function::( + "get_first_json_object", + |s, path| path.query(&s).first().map(|v| v.to_string()), + |s| s.as_deref().into(), + args, + ) +} + +pub fn extract_json_string(args: &[ColumnarValue]) -> Result { + json_function::( + "extract_json_string", + |s, path| { + path.query(&s) + .first() + .and_then(|v| v.as_str().map(|s| s.to_string())) + }, + |s| s.as_deref().into(), + args, + ) +} + +// This code is vendored from +// https://github.com/datafusion-contrib/datafusion-functions-json/blob/main/src/common_union.rs +// as the `is_json_union` function is not public. It should be kept in sync with that code so +// that we are able to detect JSON unions and rewrite them to serialized JSON for sinks. 
/// Returns true iff `data_type` is exactly the sparse union layout used by
/// datafusion-functions-json to represent dynamic JSON values.
pub(crate) fn is_json_union(data_type: &DataType) -> bool {
    match data_type {
        DataType::Union(fields, UnionMode::Sparse) => fields == &union_fields(),
        _ => false,
    }
}

// Type ids of the JSON union variants; must match the vendored layout above.
pub(crate) const TYPE_ID_NULL: i8 = 0;
const TYPE_ID_BOOL: i8 = 1;
const TYPE_ID_INT: i8 = 2;
const TYPE_ID_FLOAT: i8 = 3;
const TYPE_ID_STR: i8 = 4;
const TYPE_ID_ARRAY: i8 = 5;
const TYPE_ID_OBJECT: i8 = 6;

/// Lazily-built canonical field set of the JSON union (cached in a OnceLock).
/// Array/object variants carry `is_json=true` metadata, as in the vendored
/// upstream code.
fn union_fields() -> UnionFields {
    static FIELDS: OnceLock<UnionFields> = OnceLock::new();
    FIELDS
        .get_or_init(|| {
            let json_metadata: HashMap<String, String> =
                HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]);
            UnionFields::from_iter([
                (
                    TYPE_ID_NULL,
                    Arc::new(Field::new("null", DataType::Null, true)),
                ),
                (
                    TYPE_ID_BOOL,
                    Arc::new(Field::new("bool", DataType::Boolean, false)),
                ),
                (
                    TYPE_ID_INT,
                    Arc::new(Field::new("int", DataType::Int64, false)),
                ),
                (
                    TYPE_ID_FLOAT,
                    Arc::new(Field::new("float", DataType::Float64, false)),
                ),
                (
                    TYPE_ID_STR,
                    Arc::new(Field::new("str", DataType::Utf8, false)),
                ),
                (
                    TYPE_ID_ARRAY,
                    Arc::new(
                        Field::new("array", DataType::Utf8, false)
                            .with_metadata(json_metadata.clone()),
                    ),
                ),
                (
                    TYPE_ID_OBJECT,
                    Arc::new(
                        Field::new("object", DataType::Utf8, false)
                            .with_metadata(json_metadata.clone()),
                    ),
                ),
            ])
        })
        .clone()
}
// End vendored code

/// UDF body: serializes a JSON-union column to a Utf8 column of JSON text.
/// Scalars are promoted to single-element arrays; the result is always an
/// array value.
pub fn serialize_json_union(args: &[ColumnarValue]) -> Result<ColumnarValue> {
    assert_eq!(args.len(), 1);
    let array = match args.first().unwrap() {
        ColumnarValue::Array(a) => a.clone(),
        ColumnarValue::Scalar(s) => s.to_array_of_size(1)?,
    };

    let mut b = StringBuilder::with_capacity(array.len(), array.get_array_memory_size());

    write_union(&mut b, &array)?;

    Ok(ColumnarValue::Array(Arc::new(b.finish())))
}

/// Writes every element of a JSON-union array into the string builder, one
/// output value per input row (NULL rows stay NULL).
fn write_union(b: &mut StringBuilder, array: &ArrayRef) -> Result<(), std::fmt::Error> {
    assert!(
        is_json_union(array.data_type()),
        "array item is not a valid JSON union"
    );
    let json_union = array.as_any().downcast_ref::<UnionArray>().unwrap();

    for i in 0..json_union.len() {
        if json_union.is_null(i) {
            b.append_null();
        } else {
            write_value(b, json_union.type_id(i), &json_union.value(i))?;
            // The Write impl above only buffers bytes; append_value("")
            // finalizes the buffered bytes as one output string value.
            b.append_value("");
        }
    }

    Ok(())
}

/// Writes one union variant's value (a single-element array produced by
/// `UnionArray::value`) as JSON text into the builder's current value buffer.
fn write_value(b: &mut StringBuilder, id: i8, a: &ArrayRef) -> Result<(), std::fmt::Error> {
    match id {
        TYPE_ID_NULL => write!(b, "null")?,
        TYPE_ID_BOOL => write!(b, "{}", a.as_boolean().value(0))?,
        TYPE_ID_INT => write!(b, "{}", a.as_primitive::<Int64Type>().value(0))?,
        TYPE_ID_FLOAT => write!(b, "{}", a.as_primitive::<Float64Type>().value(0))?,
        TYPE_ID_STR => {
            // assumes that this is already a valid (escaped) json string as the only way to
            // construct these values are by parsing (valid) JSON
            b.write_char('"')?;
            b.write_str(a.as_string::<i32>().value(0))?;
            b.write_char('"')?;
        }
        TYPE_ID_ARRAY => {
            b.write_str(a.as_string::<i32>().value(0))?;
        }
        TYPE_ID_OBJECT => {
            b.write_str(a.as_string::<i32>().value(0))?;
        }
        _ => unreachable!("invalid union type in JSON union: {}", id),
    }

    Ok(())
}

/// Wraps `node` in a projection that rewrites every JSON-union column through
/// the `serialize_json_union` UDF so sinks receive plain Utf8 JSON text;
/// non-union columns pass through unchanged (aliased back to their names).
pub(crate) fn serialize_outgoing_json(
    registry: &StreamSchemaProvider,
    node: Arc<LogicalPlan>,
) -> LogicalPlan {
    let exprs = node
        .schema()
        .fields()
        .iter()
        .map(|f| {
            if is_json_union(f.data_type()) {
                Expr::Alias(Alias::new(
                    Expr::ScalarFunction(ScalarFunction::new_udf(
                        registry.udf(scalar_fn::SERIALIZE_JSON_UNION).unwrap(),
                        vec![col(f.name())],
                    )),
                    Option::<TableReference>::None,
                    f.name(),
                ))
            } else {
                col(f.name())
            }
        })
        .collect();

    LogicalPlan::Projection(Projection::try_new(exprs, node).unwrap())
}

#[cfg(test)]
mod test {
    use datafusion::arrow::array::StringArray;
    use datafusion::arrow::array::builder::{ListBuilder, StringBuilder};
    use datafusion::common::ScalarValue;
    use std::sync::Arc;

    #[test]
    fn test_extract_json() {
        let input = Arc::new(StringArray::from(vec![
            r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#,
            r#"{"a": 3, "b": 4}"#,
            r#"{"a": 5, "b": 6}"#,
        ]));

        let path = "$.c.d";

        let result = super::extract_json(&[
            super::ColumnarValue::Array(input),
            super::ColumnarValue::Scalar(path.into()),
        ])
        .unwrap();

        // Rows without a match produce an empty list, not NULL.
        let mut expected = ListBuilder::new(StringBuilder::new());
        expected.append_value(vec![Some("\"hello\"".to_string())]);
        expected.append_value(Vec::<Option<String>>::new());
        expected.append_value(Vec::<Option<String>>::new());
        if let super::ColumnarValue::Array(result) = result {
            assert_eq!(*result, expected.finish());
        } else {
            panic!("Expected array, got scalar");
        }

        let result = super::extract_json(&[
            super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()),
            super::ColumnarValue::Scalar(path.into()),
        ])
        .unwrap();

        let mut expected = ListBuilder::with_capacity(StringBuilder::new(), 1);
        expected.append_value(vec![Some("\"hello\"".to_string())]);

        if let super::ColumnarValue::Scalar(ScalarValue::List(result)) = result {
            assert_eq!(*result, expected.finish());
        } else {
            panic!("Expected scalar list");
        }
    }

    #[test]
    fn test_get_first_json_object() {
        let input = Arc::new(StringArray::from(vec![
            r#"{"a": 1, "b": 2}"#,
            r#"{"a": 3}"#,
            r#"{"a": 5, "b": 6}"#,
        ]));

        let path = "$.b";

        let result = super::get_first_json_object(&[
            super::ColumnarValue::Array(input),
            super::ColumnarValue::Scalar(path.into()),
        ])
        .unwrap();

        let expected = StringArray::from(vec![Some("2"), None, Some("6")]);

        if let super::ColumnarValue::Array(result) = result {
            assert_eq!(*result, expected);
        } else {
            panic!("Expected array, got scalar");
        }

        let result = super::get_first_json_object(&[
            super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()),
            super::ColumnarValue::Scalar("$.c.d".into()),
        ])
        .unwrap();

        // String matches keep their JSON quoting.
        let expected = ScalarValue::Utf8(Some("\"hello\"".to_string()));

        if let super::ColumnarValue::Scalar(result) = result {
            assert_eq!(result, expected);
        } else {
            panic!("Expected scalar");
        }
    }

    #[test]
    fn test_extract_json_string() {
        let input = Arc::new(StringArray::from(vec![
            r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#,
            r#"{"a": 3, "b": 4}"#,
            r#"{"a": 5, "b": 6}"#,
        ]));

        let path = "$.c.d";

        let result = super::extract_json_string(&[
            super::ColumnarValue::Array(input),
            super::ColumnarValue::Scalar(path.into()),
        ])
        .unwrap();

        // Unlike get_first_json_object, the string is returned unquoted.
        let expected = StringArray::from(vec![Some("hello"), None, None]);

        if let super::ColumnarValue::Array(result) = result {
            assert_eq!(*result, expected);
        } else {
            panic!("Expected array, got scalar");
        }

        let result = super::extract_json_string(&[
            super::ColumnarValue::Scalar(r#"{"a": 1, "b": 2, "c": { "d": "hello" }}"#.into()),
            super::ColumnarValue::Scalar(path.into()),
        ])
        .unwrap();

        let expected = ScalarValue::Utf8(Some("hello".to_string()));

        if let super::ColumnarValue::Scalar(result) = result {
            assert_eq!(result, expected);
        } else {
            panic!("Expected scalar");
        }
    }
}

// NOTE(review): the pest grammar below belongs to src/sql/grammar.pest, which
// is DELETED by this patch; the content is retained here only as the removed
// file's text.

// =============================================================================
// FUNCTION SQL Grammar
//
// Using pest PEG syntax, referencing ANTLR style
// =============================================================================

// =============================================================================
// 1. Whitespace (automatically skipped)
// =============================================================================

WHITESPACE = _{ " " | "\t" | "\r" | "\n" }

// =============================================================================
// 2.
Keywords (case-insensitive) -// ============================================================================= - -kw_create = _{ C ~ R ~ E ~ A ~ T ~ E } -kw_drop = _{ D ~ R ~ O ~ P } -kw_start = _{ S ~ T ~ A ~ R ~ T } -kw_stop = _{ S ~ T ~ O ~ P } -kw_show = _{ S ~ H ~ O ~ W } -kw_with = _{ W ~ I ~ T ~ H } -kw_function = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N } -kw_functions = _{ F ~ U ~ N ~ C ~ T ~ I ~ O ~ N ~ S } - -// ============================================================================= -// 3. Operators & Symbols -// ============================================================================= - -LPAREN = _{ "(" } -RPAREN = _{ ")" } -COMMA = _{ "," } -EQ = _{ "=" } -SQUOTE = _{ "'" } -DQUOTE = _{ "\"" } - -// ============================================================================= -// 4. Literals -// ============================================================================= - -// String literal (single or double quotes) -string_literal = @{ - SQUOTE ~ string_inner_single ~ SQUOTE | - DQUOTE ~ string_inner_double ~ DQUOTE -} - -string_inner_single = @{ (!(SQUOTE | "\\") ~ ANY | escape_seq)* } -string_inner_double = @{ (!(DQUOTE | "\\") ~ ANY | escape_seq)* } -escape_seq = @{ "\\" ~ ANY } - -// ============================================================================= -// 5. Identifiers -// ============================================================================= - -// Task name identifier -identifier = @{ (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | "-")* } - -// ============================================================================= -// 6. Statements -// ============================================================================= - -// Entry rule -statement = _{ - SOI ~ ( - create_stmt | - drop_stmt | - start_stmt | - stop_stmt | - show_stmt - ) ~ EOI -} - -// CREATE FUNCTION WITH (...) 
-// Note: name is read from config file, not from SQL statement -create_stmt = { kw_create ~ kw_function ~ kw_with ~ properties } - -// DROP FUNCTION name -drop_stmt = { kw_drop ~ kw_function ~ identifier } - -// START FUNCTION name -start_stmt = { kw_start ~ kw_function ~ identifier } - -// STOP FUNCTION name -stop_stmt = { kw_stop ~ kw_function ~ identifier } - -// SHOW FUNCTIONS -show_stmt = { kw_show ~ kw_functions } - -// ============================================================================= -// 7. Properties -// ============================================================================= - -// Property list ('key'='value', ...) -properties = { LPAREN ~ property ~ (COMMA ~ property)* ~ RPAREN } - -// Single property 'key'='value' -property = { property_key ~ EQ ~ property_value } - -// Property key (string) -property_key = { string_literal } - -// Property value (string) -property_value = { string_literal } - -// ============================================================================= -// 8. 
Character Fragments (for case-insensitive matching) -// ============================================================================= - -A = _{ "A" | "a" } -B = _{ "B" | "b" } -C = _{ "C" | "c" } -D = _{ "D" | "d" } -E = _{ "E" | "e" } -F = _{ "F" | "f" } -G = _{ "G" | "g" } -H = _{ "H" | "h" } -I = _{ "I" | "i" } -J = _{ "J" | "j" } -K = _{ "K" | "k" } -L = _{ "L" | "l" } -M = _{ "M" | "m" } -N = _{ "N" | "n" } -O = _{ "O" | "o" } -P = _{ "P" | "p" } -Q = _{ "Q" | "q" } -R = _{ "R" | "r" } -S = _{ "S" | "s" } -T = _{ "T" | "t" } -U = _{ "U" | "u" } -V = _{ "V" | "v" } -W = _{ "W" | "w" } -X = _{ "X" | "x" } -Y = _{ "Y" | "y" } -Z = _{ "Z" | "z" } diff --git a/src/sql/logical_node/aggregate.rs b/src/sql/logical_node/aggregate.rs new file mode 100644 index 00000000..d9833c50 --- /dev/null +++ b/src/sql/logical_node/aggregate.rs @@ -0,0 +1,637 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

use std::fmt::Formatter;
use std::sync::Arc;
use std::time::Duration;

use arrow_array::types::IntervalMonthDayNanoType;
use datafusion::common::{Column, DFSchemaRef, Result, ScalarValue, internal_err};
use datafusion::logical_expr::{
    self, BinaryExpr, Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore,
    expr::ScalarFunction,
};
use datafusion_common::{DFSchema, DataFusionError, plan_err};
use datafusion_expr::Aggregate;
use datafusion_proto::physical_plan::to_proto::serialize_physical_expr;
use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec};
use datafusion_proto::protobuf::PhysicalPlanNode;
use prost::Message;
use protocol::function_stream_graph::{
    SessionWindowAggregateOperator, SlidingWindowAggregateOperator, TumblingWindowAggregateOperator,
};

use crate::multifield_partial_ord;
use crate::sql::common::constants::{extension_node, proto_operator_name};
use crate::sql::common::{FsSchema, FsSchemaRef};
use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName};
use crate::sql::logical_node::{
    CompiledTopologyNode, StreamingOperatorBlueprint, SystemTimestampInjectorNode,
};
use crate::sql::logical_planner::planner::{NamedNode, Planner, SplitPlanOutput};
use crate::sql::physical::{StreamingExtensionCodec, window};
use crate::sql::types::{
    QualifiedField, TIMESTAMP_FIELD, WindowBehavior, WindowType, build_df_schema,
    build_df_schema_with_metadata, extract_qualified_fields,
};

pub(crate) const STREAM_AGG_EXTENSION_NAME: &str = extension_node::STREAM_WINDOW_AGGREGATE;

/// Represents a streaming windowed aggregation node in the logical plan.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct StreamWindowAggregateNode {
    // The window behavior (tumbling/sliding/session) driving this aggregate.
    pub(crate) window_spec: WindowBehavior,
    // The underlying DataFusion aggregate plan being windowed.
    pub(crate) base_agg_plan: LogicalPlan,
    // Schema of the post-aggregation projection (the node's output).
    pub(crate) output_schema: DFSchemaRef,
    // Grouping keys used to partition the stream.
    pub(crate) partition_keys: Vec<Column>,
    // Projection applied after the windowed aggregation completes.
    pub(crate) post_aggregation_plan: LogicalPlan,
}

multifield_partial_ord!(
    StreamWindowAggregateNode,
    base_agg_plan,
    partition_keys,
    post_aggregation_plan
);

impl StreamWindowAggregateNode {
    /// Safely constructs a new node, computing the final projection without panicking.
    pub fn try_new(
        window_spec: WindowBehavior,
        base_agg_plan: LogicalPlan,
        partition_keys: Vec<Column>,
    ) -> Result<Self> {
        let post_aggregation_plan =
            WindowBoundaryMath::build_post_aggregation(&base_agg_plan, window_spec.clone())?;

        Ok(Self {
            window_spec,
            base_agg_plan,
            // Output schema comes from the post-aggregation projection.
            output_schema: post_aggregation_plan.schema().clone(),
            partition_keys,
            post_aggregation_plan,
        })
    }

    /// Compiles this node into a tumbling-window physical operator config:
    /// splits the aggregate into partial/final phases, serializes both plus a
    /// binning function and the final projection into protobuf.
    fn build_tumbling_operator(
        &self,
        planner: &Planner,
        node_id: usize,
        input_schema: DFSchemaRef,
        duration: Duration,
    ) -> Result<LogicalNode> {
        // Assigns each row to a window bin of width `duration`.
        let binning_expr = planner.binning_function_proto(duration, input_schema.clone())?;

        let SplitPlanOutput {
            partial_aggregation_plan,
            partial_schema,
            finish_plan,
        } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?;

        let final_physical = planner.sync_plan(&self.post_aggregation_plan)?;
        let final_physical_proto = PhysicalPlanNode::try_from_physical_plan(
            final_physical,
            &StreamingExtensionCodec::default(),
        )?;

        let operator_config = TumblingWindowAggregateOperator {
            name: proto_operator_name::TUMBLING_WINDOW.to_string(),
            width_micros: duration.as_micros() as u64,
            binning_function: binning_expr.encode_to_vec(),
            input_schema: Some(
                FsSchema::from_schema_keys(
                    Arc::new(input_schema.as_ref().into()),
                    self.partition_keys.clone(),
                )?
                .into(),
            ),
            partial_schema: Some(partial_schema.into()),
            partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(),
            final_aggregation_plan: finish_plan.encode_to_vec(),
            final_projection: Some(final_physical_proto.encode_to_vec()),
        };

        Ok(LogicalNode::single(
            node_id as u32,
            format!("tumbling_{node_id}"),
            OperatorName::TumblingWindowAggregate,
            operator_config.encode_to_vec(),
            format!("TumblingWindow<{}>", operator_config.name),
            1,
        ))
    }

    /// Compiles this node into a sliding-window physical operator config;
    /// same partial/final split as tumbling, but bins by `slide_interval`
    /// while windows span `duration`.
    fn build_sliding_operator(
        &self,
        planner: &Planner,
        node_id: usize,
        input_schema: DFSchemaRef,
        duration: Duration,
        slide_interval: Duration,
    ) -> Result<LogicalNode> {
        // Bin by the slide, not the full window width.
        let binning_expr = planner.binning_function_proto(slide_interval, input_schema.clone())?;

        let SplitPlanOutput {
            partial_aggregation_plan,
            partial_schema,
            finish_plan,
        } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?;

        let final_physical = planner.sync_plan(&self.post_aggregation_plan)?;
        let final_physical_proto = PhysicalPlanNode::try_from_physical_plan(
            final_physical,
            &StreamingExtensionCodec::default(),
        )?;

        let operator_config = SlidingWindowAggregateOperator {
            name: format!("SlidingWindow<{duration:?}>"),
            width_micros: duration.as_micros() as u64,
            slide_micros: slide_interval.as_micros() as u64,
            binning_function: binning_expr.encode_to_vec(),
            input_schema: Some(
                FsSchema::from_schema_keys(
                    Arc::new(input_schema.as_ref().into()),
                    self.partition_keys.clone(),
                )?
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_physical_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("sliding_window_{node_id}"), + OperatorName::SlidingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::SLIDING_WINDOW_LABEL.to_string(), + 1, + )) + } + + fn build_session_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + ) -> Result { + let WindowBehavior::FromOperator { + window: WindowType::Session { gap }, + window_index, + window_field, + is_nested: false, + } = &self.window_spec + else { + return plan_err!("Expected standard session window configuration"); + }; + + let output_fields = extract_qualified_fields(self.base_agg_plan.schema()); + let LogicalPlan::Aggregate(base_agg) = self.base_agg_plan.clone() else { + return plan_err!("Base plan must be an Aggregate node"); + }; + + let key_count = self.partition_keys.len(); + let unkeyed_schema = Arc::new(build_df_schema_with_metadata( + &output_fields[key_count..], + self.base_agg_plan.schema().metadata().clone(), + )?); + + let unkeyed_agg_node = Aggregate::try_new_with_schema( + base_agg.input.clone(), + vec![], + base_agg.aggr_expr.clone(), + unkeyed_schema, + )?; + + let physical_agg = planner.sync_plan(&LogicalPlan::Aggregate(unkeyed_agg_node))?; + let physical_agg_proto = PhysicalPlanNode::try_from_physical_plan( + physical_agg, + &StreamingExtensionCodec::default(), + )?; + + let operator_config = SessionWindowAggregateOperator { + name: format!("session_window_{node_id}"), + gap_micros: gap.as_micros() as u64, + window_field_name: window_field.name().to_string(), + window_index: *window_index as u64, + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + unkeyed_aggregate_schema: None, + partial_aggregation_plan: vec![], + final_aggregation_plan: physical_agg_proto.encode_to_vec(), + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("SessionWindow<{gap:?}>"), + OperatorName::SessionWindowAggregate, + operator_config.encode_to_vec(), + operator_config.name.clone(), + 1, + )) + } + + fn build_instant_operator( + &self, + planner: &Planner, + node_id: usize, + input_schema: DFSchemaRef, + apply_final_projection: bool, + ) -> Result { + let ts_column_expr = Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + let binning_expr = planner.create_physical_expr(&ts_column_expr, &input_schema)?; + let binning_proto = + serialize_physical_expr(&binning_expr, &DefaultPhysicalExtensionCodec {})?; + + let final_projection_payload = if apply_final_projection { + let physical_plan = planner.sync_plan(&self.post_aggregation_plan)?; + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &StreamingExtensionCodec::default(), + )?; + Some(proto_node.encode_to_vec()) + } else { + None + }; + + let SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + } = planner.split_physical_plan(self.partition_keys.clone(), &self.base_agg_plan, true)?; + + let operator_config = TumblingWindowAggregateOperator { + name: proto_operator_name::INSTANT_WINDOW.to_string(), + width_micros: 0, + binning_function: binning_proto.encode_to_vec(), + input_schema: Some( + FsSchema::from_schema_keys( + Arc::new(input_schema.as_ref().into()), + self.partition_keys.clone(), + )? 
+ .into(), + ), + partial_schema: Some(partial_schema.into()), + partial_aggregation_plan: partial_aggregation_plan.encode_to_vec(), + final_aggregation_plan: finish_plan.encode_to_vec(), + final_projection: final_projection_payload, + }; + + Ok(LogicalNode::single( + node_id as u32, + format!("instant_window_{node_id}"), + OperatorName::TumblingWindowAggregate, + operator_config.encode_to_vec(), + proto_operator_name::INSTANT_WINDOW_LABEL.to_string(), + 1, + )) + } +} + +impl StreamingOperatorBlueprint for StreamWindowAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_id: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("StreamWindowAggregateNode requires exactly one input schema"); + } + + let raw_schema = input_schemas.remove(0); + let df_schema = Arc::new(DFSchema::try_from(raw_schema.schema.as_ref().clone())?); + + let logical_operator = match &self.window_spec { + WindowBehavior::FromOperator { + window, is_nested, .. + } => { + if *is_nested { + self.build_instant_operator(planner, node_id, df_schema, true)? + } else { + match window { + WindowType::Tumbling { width } => { + self.build_tumbling_operator(planner, node_id, df_schema, *width)? + } + WindowType::Sliding { width, slide } => self + .build_sliding_operator(planner, node_id, df_schema, *width, *slide)?, + WindowType::Session { .. } => { + self.build_session_operator(planner, node_id, df_schema)? 
+ } + WindowType::Instant => { + return plan_err!( + "Instant window is invalid within standard operator context" + ); + } + } + } + } + WindowBehavior::InData => self + .build_instant_operator(planner, node_id, df_schema, false) + .map_err(|e| e.context("Failed compiling instant window"))?, + }; + + let link = LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*raw_schema).clone()); + Ok(CompiledTopologyNode { + execution_unit: logical_operator, + routing_edges: vec![link], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let schema_ref = (*self.output_schema).clone().into(); + FsSchema::from_schema_unkeyed(Arc::new(schema_ref)) + .expect("StreamWindowAggregateNode output schema must contain timestamp column") + } +} + +impl UserDefinedLogicalNodeCore for StreamWindowAggregateNode { + fn name(&self) -> &str { + STREAM_AGG_EXTENSION_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_agg_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let spec_desc = match &self.window_spec { + WindowBehavior::InData => "InData".to_string(), + WindowBehavior::FromOperator { window, .. 
} => format!("FromOperator({window:?})"), + }; + write!( + f, + "StreamWindowAggregate: {} | spec: {}", + self.schema(), + spec_desc + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("StreamWindowAggregateNode expects exactly 1 input"); + } + Self::try_new( + self.window_spec.clone(), + inputs[0].clone(), + self.partition_keys.clone(), + ) + } +} + +// ----------------------------------------------------------------------------- +// Dedicated boundary math for window bin / post-aggregation projection +// ----------------------------------------------------------------------------- + +struct WindowBoundaryMath; + +impl WindowBoundaryMath { + fn interval_nanos(nanos: i64) -> Expr { + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 0, 0, nanos, + ))), + None, + ) + } + + fn build_post_aggregation( + agg_plan: &LogicalPlan, + window_spec: WindowBehavior, + ) -> Result { + let ts_field: QualifiedField = agg_plan + .inputs() + .first() + .ok_or_else(|| DataFusionError::Plan("Aggregate has no inputs".into()))? + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? + .into(); + + let plan_with_ts = LogicalPlan::Extension(Extension { + node: Arc::new(SystemTimestampInjectorNode::try_new( + agg_plan.clone(), + ts_field.qualifier().cloned(), + )?), + }); + + let (win_field, win_index, duration, is_nested) = match window_spec { + WindowBehavior::InData => return Ok(plan_with_ts), + WindowBehavior::FromOperator { + window, + window_field, + window_index, + is_nested, + } => match window { + WindowType::Tumbling { width } | WindowType::Sliding { width, .. } => { + (window_field, window_index, width, is_nested) + } + WindowType::Session { .. 
} => { + return Ok(LogicalPlan::Extension(Extension { + node: Arc::new(InjectWindowFieldNode::try_new( + plan_with_ts, + window_field, + window_index, + )?), + })); + } + WindowType::Instant => return Ok(plan_with_ts), + }, + }; + + if is_nested { + return Self::build_nested_projection(plan_with_ts, win_field, win_index, duration); + } + + let mut output_fields = extract_qualified_fields(agg_plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + ts_col_expr.clone(), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + output_fields.push(ts_field); + + let bin_end_expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos((duration.as_nanos() - 1) as i64)), + }); + projections.push(bin_end_expr); + + Ok(LogicalPlan::Projection( + logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan_with_ts), + Arc::new(build_df_schema(&output_fields)?), + )?, + )) + } + + fn build_nested_projection( + plan: LogicalPlan, + win_field: QualifiedField, + win_index: usize, + duration: Duration, + ) -> Result { + let ts_field: QualifiedField = plan + .schema() + .qualified_field_with_unqualified_name(TIMESTAMP_FIELD)? 
+ .into(); + let ts_col_expr = Expr::Column(Column::new(ts_field.qualifier().cloned(), ts_field.name())); + + let mut output_fields = extract_qualified_fields(plan.schema()); + let mut projections: Vec<_> = output_fields + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + + output_fields.insert(win_index, win_field.clone()); + + let win_func_expr = Expr::ScalarFunction(ScalarFunction { + func: window(), + args: vec![ + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr.clone()), + op: logical_expr::Operator::Minus, + right: Box::new(Self::interval_nanos(duration.as_nanos() as i64 - 1)), + }), + Expr::BinaryExpr(BinaryExpr { + left: Box::new(ts_col_expr), + op: logical_expr::Operator::Plus, + right: Box::new(Self::interval_nanos(1)), + }), + ], + }); + + projections.insert( + win_index, + win_func_expr.alias_qualified(win_field.qualifier().cloned(), win_field.name()), + ); + + Ok(LogicalPlan::Projection( + logical_expr::Projection::try_new_with_schema( + projections, + Arc::new(plan), + Arc::new(build_df_schema(&output_fields)?), + )?, + )) + } +} + +// ----------------------------------------------------------------------------- +// Field injection node (session window column placement) +// ----------------------------------------------------------------------------- + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct InjectWindowFieldNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_field: QualifiedField, + pub(crate) insertion_index: usize, + pub(crate) new_schema: DFSchemaRef, +} + +multifield_partial_ord!(InjectWindowFieldNode, upstream_plan, insertion_index); + +impl InjectWindowFieldNode { + fn try_new( + upstream_plan: LogicalPlan, + target_field: QualifiedField, + insertion_index: usize, + ) -> Result { + let mut fields = extract_qualified_fields(upstream_plan.schema()); + fields.insert(insertion_index, target_field.clone()); + let meta = upstream_plan.schema().metadata().clone(); + + Ok(Self { + 
upstream_plan, + target_field, + insertion_index, + new_schema: Arc::new(build_df_schema_with_metadata(&fields, meta)?), + }) + } +} + +impl UserDefinedLogicalNodeCore for InjectWindowFieldNode { + fn name(&self) -> &str { + "InjectWindowFieldNode" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.new_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "InjectWindowField: insert {:?} at offset {}", + self.target_field, self.insertion_index + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("InjectWindowFieldNode expects exactly 1 input"); + } + Self::try_new( + inputs[0].clone(), + self.target_field.clone(), + self.insertion_index, + ) + } +} diff --git a/src/sql/logical_node/async_udf.rs b/src/sql/logical_node/async_udf.rs new file mode 100644 index 00000000..6cd2da7b --- /dev/null +++ b/src/sql/logical_node/async_udf.rs @@ -0,0 +1,247 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::{ + Expr, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore, +}; +use datafusion_common::{internal_err, plan_err}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::function_stream_graph::{AsyncUdfOperator, AsyncUdfOrdering}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::constants::sql_field; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{ + DylibUdfConfig, LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName, +}; +use crate::sql::logical_node::streaming_operator_blueprint::{ + CompiledTopologyNode, StreamingOperatorBlueprint, +}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{QualifiedField, build_df_schema, extract_qualified_fields}; + +pub(crate) const NODE_TYPE_NAME: &str = extension_node::ASYNC_FUNCTION_EXECUTION; + +/// Represents a logical node that executes an external asynchronous function (UDF) +/// and projects the final results into the streaming pipeline. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct AsyncFunctionExecutionNode { + pub(crate) upstream_plan: Arc, + pub(crate) operator_name: String, + pub(crate) function_config: DylibUdfConfig, + pub(crate) invocation_args: Vec, + pub(crate) result_projections: Vec, + pub(crate) preserve_ordering: bool, + pub(crate) concurrency_limit: usize, + pub(crate) execution_timeout: Duration, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + AsyncFunctionExecutionNode, + upstream_plan, + operator_name, + function_config, + invocation_args, + result_projections, + preserve_ordering, + concurrency_limit, + execution_timeout +); + +impl AsyncFunctionExecutionNode { + /// Compiles logical expressions into serialized physical protobuf bytes. + fn compile_physical_expressions( + &self, + planner: &Planner, + expressions: &[Expr], + schema_context: &DFSchemaRef, + ) -> Result>> { + expressions + .iter() + .map(|logical_expr| { + let physical_expr = planner.create_physical_expr(logical_expr, schema_context)?; + let serialized = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + Ok(serialized.encode_to_vec()) + }) + .collect() + } + + /// Computes the intermediate schema which bridges the upstream output + /// and the raw asynchronous result injected by the UDF execution. 
+ fn compute_intermediate_schema(&self) -> Result { + let mut fields = extract_qualified_fields(self.upstream_plan.schema()); + + let raw_result_field = QualifiedField::new( + None, + sql_field::ASYNC_RESULT, + self.function_config.return_type.clone(), + true, + ); + fields.push(raw_result_field); + + Ok(Arc::new(build_df_schema(&fields)?)) + } + + fn to_protobuf_config( + &self, + compiled_args: Vec>, + compiled_projections: Vec>, + ) -> AsyncUdfOperator { + let ordering_strategy = if self.preserve_ordering { + AsyncUdfOrdering::Ordered + } else { + AsyncUdfOrdering::Unordered + }; + + AsyncUdfOperator { + name: self.operator_name.clone(), + udf: Some(self.function_config.clone().into()), + arg_exprs: compiled_args, + final_exprs: compiled_projections, + ordering: ordering_strategy as i32, + max_concurrency: self.concurrency_limit as u32, + timeout_micros: self.execution_timeout.as_micros() as u64, + } + } +} + +impl StreamingOperatorBlueprint for AsyncFunctionExecutionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("AsyncFunctionExecutionNode requires exactly one input schema"); + } + + let compiled_args = self.compile_physical_expressions( + planner, + &self.invocation_args, + self.upstream_plan.schema(), + )?; + + let intermediate_schema = self.compute_intermediate_schema()?; + let compiled_projections = self.compile_physical_expressions( + planner, + &self.result_projections, + &intermediate_schema, + )?; + + let operator_config = self.to_protobuf_config(compiled_args, compiled_projections); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("async_udf_{node_index}"), + OperatorName::AsyncUdf, + operator_config.encode_to_vec(), + format!("AsyncUdf<{}>", self.operator_name), + 1, + ); + + let upstream_schema = input_schemas.remove(0); + let data_edge = + 
LogicalEdge::project_all(LogicalEdgeType::Forward, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + let arrow_fields: Vec<_> = self + .resolved_schema + .fields() + .iter() + .map(|f| (**f).clone()) + .collect(); + + FsSchema::from_fields(arrow_fields) + } +} + +impl UserDefinedLogicalNodeCore for AsyncFunctionExecutionNode { + fn name(&self) -> &str { + NODE_TYPE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + self.invocation_args + .iter() + .chain(self.result_projections.iter()) + .cloned() + .collect() + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "AsyncFunctionExecution<{}>: Concurrency={}, Ordered={}", + self.operator_name, self.concurrency_limit, self.preserve_ordering + ) + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "AsyncFunctionExecutionNode expects exactly 1 input, but received {}", + inputs.len() + ); + } + + if UserDefinedLogicalNode::expressions(self) != exprs { + return internal_err!( + "Attempted to mutate async UDF expressions during logical planning, which is not supported." 
+ ); + } + + Ok(Self { + upstream_plan: Arc::new(inputs.remove(0)), + operator_name: self.operator_name.clone(), + function_config: self.function_config.clone(), + invocation_args: self.invocation_args.clone(), + result_projections: self.result_projections.clone(), + preserve_ordering: self.preserve_ordering, + concurrency_limit: self.concurrency_limit, + execution_timeout: self.execution_timeout, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/logical_node/debezium.rs b/src/sql/logical_node/debezium.rs new file mode 100644 index 00000000..8d69c6ec --- /dev/null +++ b/src/sql/logical_node/debezium.rs @@ -0,0 +1,393 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema}; +use datafusion::common::{ + DFSchema, DFSchemaRef, DataFusionError, Result, TableReference, internal_err, plan_err, +}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::physical_plan::DisplayAs; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{cdc, extension_node}; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const UNROLL_NODE_NAME: &str = extension_node::UNROLL_DEBEZIUM_PAYLOAD; +pub(crate) const PACK_NODE_NAME: &str = extension_node::PACK_DEBEZIUM_ENVELOPE; + +// ----------------------------------------------------------------------------- +// Core Schema Codec +// ----------------------------------------------------------------------------- + +/// Transforms between flat schemas and Debezium CDC envelopes. +pub(crate) struct DebeziumSchemaCodec; + +impl DebeziumSchemaCodec { + /// Wraps a flat physical schema into a Debezium CDC envelope structure. + pub(crate) fn wrap_into_envelope( + flat_schema: &DFSchemaRef, + qualifier_override: Option, + ) -> Result { + let ts_field = if flat_schema.has_column_with_unqualified_name(TIMESTAMP_FIELD) { + Some( + flat_schema + .field_with_unqualified_name(TIMESTAMP_FIELD)? 
+ .clone(), + ) + } else { + None + }; + + let payload_fields: Vec<_> = flat_schema + .fields() + .iter() + .filter(|f| f.name() != TIMESTAMP_FIELD && f.name() != UPDATING_META_FIELD) + .cloned() + .collect(); + + let payload_struct_type = DataType::Struct(payload_fields.into()); + + let mut envelope_fields = vec![ + Arc::new(Field::new(cdc::BEFORE, payload_struct_type.clone(), true)), + Arc::new(Field::new(cdc::AFTER, payload_struct_type, true)), + Arc::new(Field::new(cdc::OP, DataType::Utf8, true)), + ]; + + if let Some(ts) = ts_field { + envelope_fields.push(Arc::new(ts)); + } + + let arrow_schema = Schema::new(envelope_fields); + let final_schema = match qualifier_override { + Some(qualifier) => DFSchema::try_from_qualified_schema(qualifier, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(final_schema)) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Unroll Debezium Payload +// ----------------------------------------------------------------------------- + +/// Decodes an incoming Debezium envelope into a flat, updating stream representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct UnrollDebeziumPayloadNode { + upstream_plan: LogicalPlan, + resolved_schema: DFSchemaRef, + pub pk_indices: Vec, + pk_names: Arc>, +} + +multifield_partial_ord!( + UnrollDebeziumPayloadNode, + upstream_plan, + pk_indices, + pk_names +); + +impl UnrollDebeziumPayloadNode { + pub fn try_new(upstream_plan: LogicalPlan, pk_names: Arc>) -> Result { + let input_schema = upstream_plan.schema(); + + let (before_idx, after_idx) = Self::validate_envelope_structure(input_schema)?; + + let payload_fields = Self::extract_payload_fields(input_schema, before_idx)?; + + let pk_indices = Self::map_primary_keys(payload_fields, &pk_names)?; + + let qualifier = Self::resolve_schema_qualifier(input_schema, before_idx, after_idx)?; + + let resolved_schema = + Self::compile_unrolled_schema(input_schema, payload_fields, qualifier)?; + + Ok(Self { + upstream_plan, + resolved_schema, + pk_indices, + pk_names, + }) + } + + fn validate_envelope_structure(schema: &DFSchemaRef) -> Result<(usize, usize)> { + let before_idx = schema + .index_of_column_by_name(None, cdc::BEFORE) + .ok_or_else(|| { + DataFusionError::Plan("Missing 'before' state column in CDC stream".into()) + })?; + + let after_idx = schema + .index_of_column_by_name(None, cdc::AFTER) + .ok_or_else(|| { + DataFusionError::Plan("Missing 'after' state column in CDC stream".into()) + })?; + + let op_idx = schema + .index_of_column_by_name(None, cdc::OP) + .ok_or_else(|| { + DataFusionError::Plan("Missing 'op' operation column in CDC stream".into()) + })?; + + let before_type = schema.field(before_idx).data_type(); + let after_type = schema.field(after_idx).data_type(); + + if before_type != after_type { + return plan_err!( + "State column type mismatch: 'before' is {before_type}, but 'after' is {after_type}" + ); + } + + if *schema.field(op_idx).data_type() != DataType::Utf8 { + return plan_err!("The '{}' column must be of type Utf8", cdc::OP); + } + + Ok((before_idx, 
after_idx)) + } + + fn extract_payload_fields( + schema: &DFSchemaRef, + state_idx: usize, + ) -> Result<&arrow_schema::Fields> { + match schema.field(state_idx).data_type() { + DataType::Struct(fields) => Ok(fields), + other => plan_err!("State columns must be of type Struct, found {other}"), + } + } + + fn map_primary_keys(fields: &arrow_schema::Fields, pk_names: &[String]) -> Result> { + pk_names + .iter() + .map(|pk| fields.find(pk).map(|(idx, _)| idx)) + .collect::>>() + .ok_or_else(|| { + DataFusionError::Plan("Specified primary key not found in payload schema".into()) + }) + } + + fn resolve_schema_qualifier( + schema: &DFSchemaRef, + before_idx: usize, + after_idx: usize, + ) -> Result> { + let before_qualifier = schema.qualified_field(before_idx).0; + let after_qualifier = schema.qualified_field(after_idx).0; + + match (before_qualifier, after_qualifier) { + (Some(bq), Some(aq)) if bq == aq => Ok(Some(bq.clone())), + (None, None) => Ok(None), + _ => plan_err!("'before' and 'after' columns must share the same namespace/qualifier"), + } + } + + fn compile_unrolled_schema( + original_schema: &DFSchemaRef, + payload_fields: &arrow_schema::Fields, + qualifier: Option, + ) -> Result { + let mut flat_fields = payload_fields.to_vec(); + + flat_fields.push(updating_meta_field()); + + let ts_idx = original_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Required event time field '{TIMESTAMP_FIELD}' is missing" + )) + })?; + + flat_fields.push(Arc::new(original_schema.field(ts_idx).clone())); + + let arrow_schema = Schema::new(flat_fields); + let compiled_schema = match qualifier { + Some(q) => DFSchema::try_from_qualified_schema(q, &arrow_schema)?, + None => DFSchema::try_from(arrow_schema)?, + }; + + Ok(Arc::new(compiled_schema)) + } +} + +impl UserDefinedLogicalNodeCore for UnrollDebeziumPayloadNode { + fn name(&self) -> &str { + UNROLL_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + 
vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "UnrollDebeziumPayload") + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "UnrollDebeziumPayloadNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0), self.pk_names.clone()) + } +} + +impl StreamingOperatorBlueprint for UnrollDebeziumPayloadNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + plan_err!( + "UnrollDebeziumPayloadNode is a logical boundary and should not be physically planned" + ) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_ref().into())) + .unwrap_or_else(|_| { + panic!("Failed to extract physical schema for {}", UNROLL_NODE_NAME) + }) + } +} + +// ----------------------------------------------------------------------------- +// Logical Node: Pack Debezium Envelope +// ----------------------------------------------------------------------------- + +/// Encodes a flat updating stream back into a Debezium CDC envelope representation. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct PackDebeziumEnvelopeNode { + upstream_plan: Arc, + envelope_schema: DFSchemaRef, +} + +multifield_partial_ord!(PackDebeziumEnvelopeNode, upstream_plan); + +impl PackDebeziumEnvelopeNode { + pub(crate) fn try_new(upstream_plan: LogicalPlan) -> Result { + let envelope_schema = DebeziumSchemaCodec::wrap_into_envelope(upstream_plan.schema(), None) + .map_err(|e| { + DataFusionError::Plan(format!("Failed to compile Debezium envelope schema: {e}")) + })?; + + Ok(Self { + upstream_plan: Arc::new(upstream_plan), + envelope_schema, + }) + } +} + +impl DisplayAs for PackDebeziumEnvelopeNode { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } +} + +impl UserDefinedLogicalNodeCore for PackDebeziumEnvelopeNode { + fn name(&self) -> &str { + PACK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.envelope_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "PackDebeziumEnvelope") + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "PackDebeziumEnvelopeNode expects exactly 1 input, got {}", + inputs.len() + ); + } + Self::try_new(inputs.remove(0)) + } +} + +impl StreamingOperatorBlueprint for PackDebeziumEnvelopeNode { + fn operator_identity(&self) -> Option { + None + } + + fn is_passthrough_boundary(&self) -> bool { + true + } + + fn compile_to_graph_node( + &self, + _: &Planner, + _: usize, + _: Vec, + ) -> Result { + internal_err!( + "PackDebeziumEnvelopeNode is a logical boundary and should not be physically planned" + ) + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.envelope_schema.as_ref().into())) + .unwrap_or_else(|_| panic!("Failed to extract physical schema for {}", PACK_NODE_NAME)) + } +} diff --git a/src/sql/logical_node/extension_try_from.rs b/src/sql/logical_node/extension_try_from.rs new file mode 100644 index 00000000..32b12d6c --- /dev/null +++ b/src/sql/logical_node/extension_try_from.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use datafusion::logical_expr::UserDefinedLogicalNode; + +use crate::sql::logical_node::aggregate::StreamWindowAggregateNode; +use crate::sql::logical_node::async_udf::AsyncFunctionExecutionNode; +use crate::sql::logical_node::debezium::{PackDebeziumEnvelopeNode, UnrollDebeziumPayloadNode}; +use crate::sql::logical_node::join::StreamingJoinNode; +use crate::sql::logical_node::key_calculation::KeyExtractionNode; +use crate::sql::logical_node::lookup::StreamReferenceJoinNode; +use crate::sql::logical_node::projection::StreamProjectionNode; +use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; +use crate::sql::logical_node::sink::StreamEgressNode; +use crate::sql::logical_node::streaming_operator_blueprint::StreamingOperatorBlueprint; +use crate::sql::logical_node::table_source::StreamIngestionNode; +use crate::sql::logical_node::updating_aggregate::ContinuousAggregateNode; +use 
crate::sql::logical_node::watermark_node::EventTimeWatermarkNode;
use crate::sql::logical_node::windows_function::StreamingWindowFunctionNode;

/// Attempts to view `node` as a [`StreamingOperatorBlueprint`] by downcasting
/// to the concrete extension type `T`. Returns `Err(())` on type mismatch so
/// callers can chain attempts cheaply.
fn try_from_t<T: StreamingOperatorBlueprint + 'static>(
    node: &dyn UserDefinedLogicalNode,
) -> std::result::Result<&dyn StreamingOperatorBlueprint, ()> {
    node.as_any()
        .downcast_ref::<T>()
        .map(|t| t as &dyn StreamingOperatorBlueprint)
        .ok_or(())
}

impl<'a> TryFrom<&'a dyn UserDefinedLogicalNode> for &'a dyn StreamingOperatorBlueprint {
    type Error = DataFusionError;

    fn try_from(node: &'a dyn UserDefinedLogicalNode) -> Result<Self> {
        // NOTE(review): the concrete type list was reconstructed from this
        // module's imports — confirm it covers every extension node and that
        // the probe order matches the original.
        try_from_t::<StreamWindowAggregateNode>(node)
            .or_else(|_| try_from_t::<AsyncFunctionExecutionNode>(node))
            .or_else(|_| try_from_t::<PackDebeziumEnvelopeNode>(node))
            .or_else(|_| try_from_t::<UnrollDebeziumPayloadNode>(node))
            .or_else(|_| try_from_t::<StreamingJoinNode>(node))
            .or_else(|_| try_from_t::<KeyExtractionNode>(node))
            .or_else(|_| try_from_t::<StreamReferenceJoinNode>(node))
            .or_else(|_| try_from_t::<StreamProjectionNode>(node))
            .or_else(|_| try_from_t::<RemoteTableBoundaryNode>(node))
            .or_else(|_| try_from_t::<StreamEgressNode>(node))
            .or_else(|_| try_from_t::<StreamIngestionNode>(node))
            .or_else(|_| try_from_t::<ContinuousAggregateNode>(node))
            .or_else(|_| try_from_t::<EventTimeWatermarkNode>(node))
            .or_else(|_| try_from_t::<StreamingWindowFunctionNode>(node))
            .map_err(|_| DataFusionError::Plan(format!("unexpected node: {}", node.name())))
    }
}

impl<'a> TryFrom<&'a Arc<dyn UserDefinedLogicalNode>> for &'a dyn StreamingOperatorBlueprint {
    type Error = DataFusionError;

    fn try_from(node: &'a Arc<dyn UserDefinedLogicalNode>) -> Result<Self> {
        TryFrom::try_from(node.as_ref())
    }
}
diff --git a/src/sql/logical_node/is_retract.rs b/src/sql/logical_node/is_retract.rs
new file mode 100644
index 00000000..4370f6ae
--- /dev/null
+++ b/src/sql/logical_node/is_retract.rs
@@ -0,0 +1,82 @@
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::common::{DFSchemaRef, Result, TableReference}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::physical::updating_meta_field; +use crate::sql::types::{ + QualifiedField, TIMESTAMP_FIELD, build_df_schema, extract_qualified_fields, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct IsRetractExtension { + pub(crate) input: LogicalPlan, + pub(crate) schema: DFSchemaRef, + pub(crate) timestamp_qualifier: Option, +} + +multifield_partial_ord!(IsRetractExtension, input, timestamp_qualifier); + +impl IsRetractExtension { + pub(crate) fn new(input: LogicalPlan, timestamp_qualifier: Option) -> Self { + let mut output_fields = extract_qualified_fields(input.schema()); + + let timestamp_index = output_fields.len() - 1; + output_fields[timestamp_index] = QualifiedField::new( + timestamp_qualifier.clone(), + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + output_fields.push((timestamp_qualifier.clone(), updating_meta_field()).into()); + let schema = Arc::new(build_df_schema(&output_fields).unwrap()); + Self { + input, + schema, + timestamp_qualifier, + } + } +} + +impl UserDefinedLogicalNodeCore for IsRetractExtension { + fn name(&self) -> &str { + "IsRetractExtension" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "IsRetractExtension") + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self::new( + inputs[0].clone(), + self.timestamp_qualifier.clone(), + )) + } +} diff --git 
a/src/sql/logical_node/join.rs b/src/sql/logical_node/join.rs new file mode 100644 index 00000000..ea142d0a --- /dev/null +++ b/src/sql/logical_node/join.rs @@ -0,0 +1,211 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result}; +use datafusion::logical_expr::expr::Expr; +use datafusion::logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::plan_err; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::function_stream_graph::JoinOperator; + +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::StreamingExtensionCodec; + +// ----------------------------------------------------------------------------- +// Constants +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_JOIN_NODE_TYPE: &str = extension_node::STREAMING_JOIN; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// 
----------------------------------------------------------------------------- + +/// A logical plan node representing a streaming join operation. +/// It bridges the DataFusion logical plan with the physical streaming execution engine. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct StreamingJoinNode { + pub(crate) underlying_plan: LogicalPlan, + pub(crate) instant_execution_mode: bool, + pub(crate) state_retention_ttl: Option, +} + +impl StreamingJoinNode { + /// Creates a new instance of the streaming join node. + pub fn new( + underlying_plan: LogicalPlan, + instant_execution_mode: bool, + state_retention_ttl: Option, + ) -> Self { + Self { + underlying_plan, + instant_execution_mode, + state_retention_ttl, + } + } + + /// Compiles the physical execution plan and serializes it into a Protobuf configuration payload. + fn compile_operator_config( + &self, + planner: &Planner, + node_identifier: &str, + left_schema: FsSchemaRef, + right_schema: FsSchemaRef, + ) -> Result { + let physical_plan = planner.sync_plan(&self.underlying_plan)?; + + let proto_node = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &StreamingExtensionCodec::default(), + )?; + + Ok(JoinOperator { + name: node_identifier.to_string(), + left_schema: Some(left_schema.as_ref().clone().into()), + right_schema: Some(right_schema.as_ref().clone().into()), + output_schema: Some(self.extract_fs_schema().into()), + join_plan: proto_node.encode_to_vec(), + ttl_micros: self.state_retention_ttl.map(|ttl| ttl.as_micros() as u64), + }) + } + + fn determine_operator_type(&self) -> OperatorName { + if self.instant_execution_mode { + OperatorName::InstantJoin + } else { + OperatorName::Join + } + } + + fn extract_fs_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.underlying_plan.schema().inner().clone()) + .expect("Fatal: Failed to convert internal join schema to FsSchema without keys") + } +} + +// 
----------------------------------------------------------------------------- +// DataFusion Logical Node Core Implementation +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingJoinNode { + fn name(&self) -> &str { + STREAM_JOIN_NODE_TYPE + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamingJoinNode: Schema={}, InstantMode={}, TTL={:?}", + self.schema(), + self.instant_execution_mode, + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return plan_err!( + "StreamingJoinNode expects exactly 1 underlying logical plan during recreation" + ); + } + + Ok(Self::new( + inputs.remove(0), + self.instant_execution_mode, + self.state_retention_ttl, + )) + } +} + +// ----------------------------------------------------------------------------- +// Streaming Graph Extension Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 2 { + return plan_err!( + "Invalid topology: StreamingJoinNode requires exactly two upstream inputs, received {}", + input_schemas.len() + ); + } + + let right_schema = input_schemas.pop().unwrap(); + let left_schema = input_schemas.pop().unwrap(); + + let node_identifier = format!("stream_join_{node_index}"); + + let operator_config = self.compile_operator_config( + planner, + &node_identifier, + left_schema.clone(), + 
right_schema.clone(), + )?; + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier.clone(), + self.determine_operator_type(), + operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_JOIN.to_string(), + 1, + ); + + let left_edge = + LogicalEdge::project_all(LogicalEdgeType::LeftJoin, left_schema.as_ref().clone()); + let right_edge = + LogicalEdge::project_all(LogicalEdgeType::RightJoin, right_schema.as_ref().clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![left_edge, right_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.extract_fs_schema() + } +} diff --git a/src/sql/logical_node/key_calculation.rs b/src/sql/logical_node/key_calculation.rs new file mode 100644 index 00000000..6bcad784 --- /dev/null +++ b/src/sql/logical_node/key_calculation.rs @@ -0,0 +1,309 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema}; +use datafusion::common::{DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_common::DFSchema; +use datafusion_expr::col; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use datafusion_proto::protobuf::PhysicalPlanNode; +use itertools::Itertools; +use prost::Message; + +use protocol::function_stream_graph::{KeyPlanOperator, ProjectionOperator}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::StreamingExtensionCodec; +use crate::sql::types::{build_df_schema_with_metadata, extract_qualified_fields}; + +pub(crate) const EXTENSION_NODE_IDENTIFIER: &str = extension_node::KEY_EXTRACTION; + +/// Routing strategy for shuffling data across the stream topology. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub enum KeyExtractionStrategy { + ColumnIndices(Vec), + CalculatedExpressions(Vec), +} + +/// Logical node that computes or extracts routing keys. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct KeyExtractionNode { + pub(crate) operator_label: Option, + pub(crate) upstream_plan: LogicalPlan, + pub(crate) extraction_strategy: KeyExtractionStrategy, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!( + KeyExtractionNode, + operator_label, + upstream_plan, + extraction_strategy +); + +impl KeyExtractionNode { + /// Extracts keys and hides them from the downstream projection. + pub fn try_new_with_projection( + upstream_plan: LogicalPlan, + target_indices: Vec, + label: String, + ) -> Result { + let projected_fields: Vec<_> = extract_qualified_fields(upstream_plan.schema()) + .into_iter() + .enumerate() + .filter(|(idx, _)| !target_indices.contains(idx)) + .map(|(_, field)| field) + .collect(); + + let metadata = upstream_plan.schema().metadata().clone(); + let resolved_schema = build_df_schema_with_metadata(&projected_fields, metadata)?; + + Ok(Self { + operator_label: Some(label), + upstream_plan, + extraction_strategy: KeyExtractionStrategy::ColumnIndices(target_indices), + resolved_schema: Arc::new(resolved_schema), + }) + } + + /// Creates a node using an explicit strategy without changing the visible schema. 
+ pub fn new(upstream_plan: LogicalPlan, strategy: KeyExtractionStrategy) -> Self { + let resolved_schema = upstream_plan.schema().clone(); + Self { + operator_label: None, + upstream_plan, + extraction_strategy: strategy, + resolved_schema, + } + } + + fn compile_index_router( + &self, + physical_plan_proto: PhysicalPlanNode, + indices: &[usize], + ) -> (Vec, OperatorName) { + let operator_config = KeyPlanOperator { + name: sql_field::DEFAULT_KEY_LABEL.into(), + physical_plan: physical_plan_proto.encode_to_vec(), + key_fields: indices.iter().map(|&idx| idx as u64).collect(), + }; + + (operator_config.encode_to_vec(), OperatorName::KeyBy) + } + + fn compile_expression_router( + &self, + planner: &Planner, + expressions: &[Expr], + input_schema_ref: &FsSchemaRef, + input_df_schema: &DFSchemaRef, + ) -> Result<(Vec, OperatorName)> { + let mut target_exprs = expressions.to_vec(); + + for field in input_schema_ref.schema.fields.iter() { + target_exprs.push(col(field.name())); + } + + let output_fs_schema = self.generate_fs_schema()?; + + for (compiled_expr, expected_field) in + target_exprs.iter().zip(output_fs_schema.schema.fields()) + { + let (expr_type, expr_nullable) = + compiled_expr.data_type_and_nullable(input_df_schema)?; + if expr_type != *expected_field.data_type() + || expr_nullable != expected_field.is_nullable() + { + return plan_err!( + "Type mismatch in key calculation: Expected {} (nullable: {}), got {} (nullable: {})", + expected_field.data_type(), + expected_field.is_nullable(), + expr_type, + expr_nullable + ); + } + } + + let mut physical_expr_payloads = Vec::with_capacity(target_exprs.len()); + for logical_expr in target_exprs { + let physical_expr = planner + .create_physical_expr(&logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to physicalize PARTITION BY expression"))?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + 
physical_expr_payloads.push(serialized_expr.encode_to_vec()); + } + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(sql_field::DEFAULT_KEY_LABEL) + .to_string(), + input_schema: Some(input_schema_ref.as_ref().clone().into()), + output_schema: Some(output_fs_schema.into()), + exprs: physical_expr_payloads, + }; + + Ok((operator_config.encode_to_vec(), OperatorName::Projection)) + } + + fn generate_fs_schema(&self) -> Result { + let base_arrow_schema = self.upstream_plan.schema().as_ref(); + + match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + FsSchema::from_schema_keys(Arc::new(base_arrow_schema.into()), indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(expressions) => { + let mut composite_fields = + Vec::with_capacity(expressions.len() + base_arrow_schema.fields().len()); + + for (idx, expr) in expressions.iter().enumerate() { + let (data_type, nullable) = expr.data_type_and_nullable(base_arrow_schema)?; + composite_fields + .push(Field::new(format!("__key_{idx}"), data_type, nullable).into()); + } + + for field in base_arrow_schema.fields().iter() { + composite_fields.push(field.clone()); + } + + let final_schema = Arc::new(Schema::new(composite_fields)); + let key_mapping = (1..=expressions.len()).collect_vec(); + FsSchema::from_schema_keys(final_schema, key_mapping) + } + } + } +} + +impl StreamingOperatorBlueprint for KeyExtractionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("KeyExtractionNode requires exactly one upstream input schema"); + } + + let input_schema_ref = input_schemas.remove(0); + let input_df_schema = Arc::new(DFSchema::try_from( + input_schema_ref.schema.as_ref().clone(), + )?); + + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + let 
physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &StreamingExtensionCodec::default(), + )?; + + let (protobuf_payload, engine_operator_name) = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + self.compile_index_router(physical_plan_proto, indices) + } + KeyExtractionStrategy::CalculatedExpressions(exprs) => { + self.compile_expression_router(planner, exprs, &input_schema_ref, &input_df_schema)? + } + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("key_{node_index}"), + engine_operator_name, + protobuf_payload, + format!("Key<{}>", self.operator_label.as_deref().unwrap_or("_")), + 1, + ); + + let data_edge = + LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema_ref).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![data_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + .expect("Fatal: Failed to generate output schema for KeyExtractionNode") + } +} + +impl UserDefinedLogicalNodeCore for KeyExtractionNode { + fn name(&self) -> &str { + EXTENSION_NODE_IDENTIFIER + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "KeyExtractionNode: Strategy={:?} | Schema={}", + self.extraction_strategy, self.resolved_schema + ) + } + + fn with_exprs_and_inputs( + &self, + exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!("KeyExtractionNode requires exactly 1 input logical plan"); + } + + let strategy = match &self.extraction_strategy { + KeyExtractionStrategy::ColumnIndices(indices) => { + KeyExtractionStrategy::ColumnIndices(indices.clone()) + } + KeyExtractionStrategy::CalculatedExpressions(_) => { + 
KeyExtractionStrategy::CalculatedExpressions(exprs) + } + }; + + Ok(Self { + operator_label: self.operator_label.clone(), + upstream_plan: inputs.remove(0), + extraction_strategy: strategy, + resolved_schema: self.resolved_schema.clone(), + }) + } +} diff --git a/src/sql/logical_node/logical/dylib_udf_config.rs b/src/sql/logical_node/logical/dylib_udf_config.rs new file mode 100644 index 00000000..9bf3368f --- /dev/null +++ b/src/sql/logical_node/logical/dylib_udf_config.rs @@ -0,0 +1,71 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use datafusion::arrow::datatypes::DataType; +use datafusion_proto::protobuf::ArrowType; +use prost::Message; +use protocol::function_stream_graph; + +#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] +pub struct DylibUdfConfig { + pub dylib_path: String, + pub arg_types: Vec, + pub return_type: DataType, + pub aggregate: bool, + pub is_async: bool, +} + +impl From for function_stream_graph::DylibUdfConfig { + fn from(from: DylibUdfConfig) -> Self { + function_stream_graph::DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + ArrowType::try_from(t) + .expect("unsupported data type") + .encode_to_vec() + }) + .collect(), + return_type: ArrowType::try_from(&from.return_type) + .expect("unsupported data type") + .encode_to_vec(), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} + +impl From for DylibUdfConfig { + fn from(from: function_stream_graph::DylibUdfConfig) -> Self { + DylibUdfConfig { + dylib_path: from.dylib_path, + arg_types: from + .arg_types + .iter() + .map(|t| { + DataType::try_from( + &ArrowType::decode(&mut t.as_slice()).expect("invalid arrow type"), + ) + .expect("invalid arrow type") + }) + .collect(), + return_type: DataType::try_from( + &ArrowType::decode(&mut from.return_type.as_slice()).unwrap(), + ) + .expect("invalid arrow type"), + aggregate: from.aggregate, + is_async: from.is_async, + } + } +} diff --git a/src/sql/logical_node/logical/fs_program_convert.rs b/src/sql/logical_node/logical/fs_program_convert.rs new file mode 100644 index 00000000..b05d68f5 --- /dev/null +++ b/src/sql/logical_node/logical/fs_program_convert.rs @@ -0,0 +1,200 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::graph::DiGraph; +use petgraph::prelude::EdgeRef; +use protocol::function_stream_graph::{ + ChainedOperator, EdgeType as ProtoEdgeType, FsEdge, FsNode, FsProgram, + FsSchema as ProtoFsSchema, +}; + +use crate::sql::api::pipelines::{PipelineEdge, PipelineGraph, PipelineNode}; +use crate::sql::common::FsSchema; + +use super::logical_edge::logical_edge_type_from_proto_i32; +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; +use super::{LogicalEdge, LogicalNode, LogicalProgram, ProgramConfig}; + +impl TryFrom for LogicalProgram { + type Error = DataFusionError; + + fn try_from(value: FsProgram) -> DFResult { + let mut graph = DiGraph::new(); + let mut id_map = HashMap::with_capacity(value.nodes.len()); + + for node in value.nodes { + let operators = node + .operators + .into_iter() + .map(|op| { + let ChainedOperator { + operator_id, + operator_name: name_str, + operator_config, + } = op; + let operator_name = OperatorName::from_str(&name_str).map_err(|_| { + DataFusionError::Plan(format!("Invalid operator name: {name_str}")) + })?; + Ok(ChainedLogicalOperator { + operator_id, + operator_name, + operator_config, + }) + }) + .collect::>>()?; + + let edges = node + .edges + .into_iter() + .map(|e| { + let fs: FsSchema = e.try_into()?; + Ok(Arc::new(fs)) + }) + .collect::>>()?; + + let logical_node = LogicalNode { + node_id: 
node.node_id, + description: node.description, + operator_chain: OperatorChain { operators, edges }, + parallelism: node.parallelism as usize, + }; + + id_map.insert(node.node_index, graph.add_node(logical_node)); + } + + for edge in value.edges { + let source = *id_map.get(&edge.source).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing source node".into()) + })?; + let target = *id_map.get(&edge.target).ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing target node".into()) + })?; + let schema = edge.schema.ok_or_else(|| { + DataFusionError::Plan("Graph integrity error: Missing edge schema".into()) + })?; + let edge_type = logical_edge_type_from_proto_i32(edge.edge_type)?; + + graph.add_edge( + source, + target, + LogicalEdge { + edge_type, + schema: Arc::new(FsSchema::try_from(schema)?), + }, + ); + } + + let program_config = value + .program_config + .map(ProgramConfig::from) + .unwrap_or_default(); + + Ok(LogicalProgram::new(graph, program_config)) + } +} + +impl From for FsProgram { + fn from(value: LogicalProgram) -> Self { + let nodes = value + .graph + .node_indices() + .filter_map(|idx| value.graph.node_weight(idx).map(|node| (idx, node))) + .map(|(idx, node)| FsNode { + node_index: idx.index() as i32, + node_id: node.node_id, + parallelism: node.parallelism as u32, + description: node.description.clone(), + operators: node + .operator_chain + .operators + .iter() + .map(|op| ChainedOperator { + operator_id: op.operator_id.clone(), + operator_name: op.operator_name.to_string(), + operator_config: op.operator_config.clone(), + }) + .collect(), + edges: node + .operator_chain + .edges + .iter() + .map(|edge| ProtoFsSchema::from((**edge).clone())) + .collect(), + }) + .collect(); + + let edges = value + .graph + .edge_indices() + .filter_map(|eidx| { + let edge = value.graph.edge_weight(eidx)?; + let (source, target) = value.graph.edge_endpoints(eidx)?; + Some(FsEdge { + source: source.index() as i32, + target: 
target.index() as i32, + schema: Some(ProtoFsSchema::from((*edge.schema).clone())), + edge_type: ProtoEdgeType::from(edge.edge_type) as i32, + }) + }) + .collect(); + + FsProgram { + nodes, + edges, + program_config: Some(value.program_config.into()), + } + } +} + +impl TryFrom for PipelineGraph { + type Error = DataFusionError; + + fn try_from(value: LogicalProgram) -> DFResult { + let nodes = value + .graph + .node_weights() + .map(|node| { + Ok(PipelineNode { + node_id: node.node_id, + operator: node.resolve_pipeline_operator_name()?, + description: node.description.clone(), + parallelism: node.parallelism as u32, + }) + }) + .collect::>>()?; + + let edges = value + .graph + .edge_references() + .filter_map(|edge| { + let src = value.graph.node_weight(edge.source())?; + let target = value.graph.node_weight(edge.target())?; + Some(PipelineEdge { + src_id: src.node_id, + dest_id: target.node_id, + key_type: "()".to_string(), + value_type: "()".to_string(), + edge_type: format!("{:?}", edge.weight().edge_type), + }) + }) + .collect(); + + Ok(PipelineGraph { nodes, edges }) + } +} diff --git a/src/sql/logical_node/logical/logical_edge.rs b/src/sql/logical_node/logical/logical_edge.rs new file mode 100644 index 00000000..87950e70 --- /dev/null +++ b/src/sql/logical_node/logical/logical_edge.rs @@ -0,0 +1,102 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +use datafusion::common::{DataFusionError, Result}; +use protocol::function_stream_graph::EdgeType as ProtoEdgeType; +use serde::{Deserialize, Serialize}; + +use crate::sql::common::FsSchema; + +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum LogicalEdgeType { + Forward, + Shuffle, + LeftJoin, + RightJoin, +} + +impl Display for LogicalEdgeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let symbol = match self { + LogicalEdgeType::Forward => "→", + LogicalEdgeType::Shuffle => "⤨", + LogicalEdgeType::LeftJoin => "-[left]⤨", + LogicalEdgeType::RightJoin => "-[right]⤨", + }; + write!(f, "{symbol}") + } +} + +impl From for LogicalEdgeType { + fn from(value: ProtoEdgeType) -> Self { + match value { + ProtoEdgeType::Unused => { + panic!("Critical: Invalid EdgeType 'Unused' encountered") + } + ProtoEdgeType::Forward => Self::Forward, + ProtoEdgeType::Shuffle => Self::Shuffle, + ProtoEdgeType::LeftJoin => Self::LeftJoin, + ProtoEdgeType::RightJoin => Self::RightJoin, + } + } +} + +impl From for ProtoEdgeType { + fn from(value: LogicalEdgeType) -> Self { + match value { + LogicalEdgeType::Forward => Self::Forward, + LogicalEdgeType::Shuffle => Self::Shuffle, + LogicalEdgeType::LeftJoin => Self::LeftJoin, + LogicalEdgeType::RightJoin => Self::RightJoin, + } + } +} + +pub(crate) fn logical_edge_type_from_proto_i32(i: i32) -> Result { + let e = ProtoEdgeType::try_from(i).map_err(|_| { + DataFusionError::Plan(format!("invalid protobuf EdgeType discriminant {i}")) + })?; + match e { + ProtoEdgeType::Unused => Err(DataFusionError::Plan( + "Critical: Invalid EdgeType 'Unused' encountered".into(), + )), + ProtoEdgeType::Forward => Ok(LogicalEdgeType::Forward), + ProtoEdgeType::Shuffle => Ok(LogicalEdgeType::Shuffle), + ProtoEdgeType::LeftJoin => Ok(LogicalEdgeType::LeftJoin), + ProtoEdgeType::RightJoin => Ok(LogicalEdgeType::RightJoin), + } +} + 
+#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +pub struct LogicalEdge { + pub edge_type: LogicalEdgeType, + pub schema: Arc, +} + +impl LogicalEdge { + pub fn new(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } + + pub fn project_all(edge_type: LogicalEdgeType, schema: FsSchema) -> Self { + LogicalEdge { + edge_type, + schema: Arc::new(schema), + } + } +} diff --git a/src/sql/logical_node/logical/logical_graph.rs b/src/sql/logical_node/logical/logical_graph.rs new file mode 100644 index 00000000..b877e2a0 --- /dev/null +++ b/src/sql/logical_node/logical/logical_graph.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use petgraph::graph::DiGraph; + +use super::logical_edge::LogicalEdge; +use super::logical_node::LogicalNode; + +pub type LogicalGraph = DiGraph; + +pub trait Optimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool; + + fn optimize(&self, plan: &mut LogicalGraph) { + loop { + if !self.optimize_once(plan) { + break; + } + } + } +} diff --git a/src/sql/logical_node/logical/logical_node.rs b/src/sql/logical_node/logical/logical_node.rs new file mode 100644 index 00000000..5f00dc4b --- /dev/null +++ b/src/sql/logical_node/logical/logical_node.rs @@ -0,0 +1,87 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Debug, Display, Formatter}; + +use datafusion::common::{DataFusionError, Result}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use super::operator_chain::{ChainedLogicalOperator, OperatorChain}; +use super::operator_name::OperatorName; + +#[derive(Clone, Serialize, Deserialize)] +pub struct LogicalNode { + pub node_id: u32, + pub description: String, + pub operator_chain: OperatorChain, + pub parallelism: usize, +} + +impl LogicalNode { + pub fn single( + id: u32, + operator_id: String, + name: OperatorName, + config: Vec, + description: String, + parallelism: usize, + ) -> Self { + Self { + node_id: id, + description, + operator_chain: OperatorChain { + operators: vec![ChainedLogicalOperator { + operator_id, + operator_name: name, + operator_config: config, + }], + edges: vec![], + }, + parallelism, + } + } + + pub fn resolve_pipeline_operator_name(&self) -> Result { + let first_op = self.operator_chain.operators.first().ok_or_else(|| { + DataFusionError::Plan("Invalid LogicalNode: Operator chain is empty".into()) + })?; + + if let Some(connector_name) = first_op.extract_connector_name() { + return Ok(connector_name); + } + + if self.operator_chain.len() == 1 { + return Ok(first_op.operator_id.clone()); + } + + Ok("chained_op".to_string()) + } +} + +impl Display for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.description) + } +} + +impl Debug for LogicalNode { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let chain_path = self + 
.operator_chain + .operators + .iter() + .map(|op| op.operator_id.as_str()) + .join(" -> "); + write!(f, "{chain_path}[{}]", self.parallelism) + } +} diff --git a/src/sql/logical_node/logical/logical_program.rs b/src/sql/logical_node/logical/logical_program.rs new file mode 100644 index 00000000..119ac469 --- /dev/null +++ b/src/sql/logical_node/logical/logical_program.rs @@ -0,0 +1,153 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::hash_map::DefaultHasher; +use std::collections::{HashMap, HashSet}; +use std::hash::Hasher; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result as DFResult}; +use petgraph::Direction; +use petgraph::dot::Dot; +use prost::Message; +use protocol::function_stream_graph::FsProgram; +use rand::distributions::Alphanumeric; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; + +use super::logical_graph::{LogicalGraph, Optimizer}; +use super::operator_name::OperatorName; +use super::program_config::ProgramConfig; + +#[derive(Clone, Debug, Default)] +pub struct LogicalProgram { + pub graph: LogicalGraph, + pub program_config: ProgramConfig, +} + +impl LogicalProgram { + pub fn new(graph: LogicalGraph, program_config: ProgramConfig) -> Self { + Self { + graph, + program_config, + } + } + + pub fn optimize(&mut self, optimizer: &dyn Optimizer) { + optimizer.optimize(&mut self.graph); + } + + pub fn update_parallelism(&mut self, overrides: 
&HashMap) { + for node in self.graph.node_weights_mut() { + if let Some(&p) = overrides.get(&node.node_id) { + node.parallelism = p; + } + } + } + + pub fn dot(&self) -> String { + format!("{:?}", Dot::with_config(&self.graph, &[])) + } + + pub fn task_count(&self) -> usize { + self.graph.node_weights().map(|nw| nw.parallelism).sum() + } + + pub fn sources(&self) -> HashSet { + self.graph + .externals(Direction::Incoming) + .filter_map(|idx| self.graph.node_weight(idx)) + .map(|node| node.node_id) + .collect() + } + + pub fn get_hash(&self) -> String { + let mut hasher = DefaultHasher::new(); + let program_bytes = FsProgram::from(self.clone()).encode_to_vec(); + hasher.write(&program_bytes); + let rng = SmallRng::seed_from_u64(hasher.finish()); + rng.sample_iter(&Alphanumeric) + .take(16) + .map(|c| (c as char).to_ascii_lowercase()) + .collect() + } + + pub fn tasks_per_operator(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| { + node.operator_chain + .operators + .iter() + .map(move |op| (op.operator_id.clone(), node.parallelism)) + }) + .collect() + } + + pub fn operator_names_by_id(&self) -> HashMap { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .map(|op| { + let resolved_name = op + .extract_connector_name() + .unwrap_or_else(|| op.operator_name.to_string()); + (op.operator_id.clone(), resolved_name) + }) + .collect() + } + + pub fn tasks_per_node(&self) -> HashMap { + self.graph + .node_weights() + .map(|node| (node.node_id, node.parallelism)) + .collect() + } + + pub fn features(&self) -> HashSet { + self.graph + .node_weights() + .flat_map(|node| &node.operator_chain.operators) + .filter_map(|op| op.extract_feature()) + .collect() + } + + /// Arrow schema carried on edges into the connector-sink node, if present. 
+ pub fn egress_arrow_schema(&self) -> Option> { + for idx in self.graph.node_indices() { + let node = self.graph.node_weight(idx)?; + if node + .operator_chain + .operators + .iter() + .any(|op| op.operator_name == OperatorName::ConnectorSink) + { + let e = self.graph.edges_directed(idx, Direction::Incoming).next()?; + return Some(Arc::clone(&e.weight().schema.schema)); + } + } + None + } + + pub fn encode_for_catalog(&self) -> DFResult> { + Ok(FsProgram::from(self.clone()).encode_to_vec()) + } + + pub fn decode_for_catalog(bytes: &[u8]) -> DFResult { + let proto = FsProgram::decode(bytes).map_err(|e| { + DataFusionError::Execution(format!("FsProgram catalog decode failed: {e}")) + })?; + LogicalProgram::try_from(proto) + } +} diff --git a/src/sql/logical_node/logical/mod.rs b/src/sql/logical_node/logical/mod.rs new file mode 100644 index 00000000..d2e9a327 --- /dev/null +++ b/src/sql/logical_node/logical/mod.rs @@ -0,0 +1,30 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod dylib_udf_config; +mod fs_program_convert; +mod logical_edge; +mod logical_graph; +mod logical_node; +mod logical_program; +mod operator_chain; +mod operator_name; +mod program_config; +mod python_udf_config; + +pub use dylib_udf_config::DylibUdfConfig; +pub use logical_edge::{LogicalEdge, LogicalEdgeType}; +pub use logical_graph::{LogicalGraph, Optimizer}; +pub use logical_node::LogicalNode; +pub use logical_program::LogicalProgram; +pub use operator_name::OperatorName; +pub use program_config::ProgramConfig; diff --git a/src/sql/logical_node/logical/operator_chain.rs b/src/sql/logical_node/logical/operator_chain.rs new file mode 100644 index 00000000..34a01a5c --- /dev/null +++ b/src/sql/logical_node/logical/operator_chain.rs @@ -0,0 +1,131 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::{EitherOrBoth, Itertools}; +use prost::Message; +use protocol::function_stream_graph::ConnectorOp; +use serde::{Deserialize, Serialize}; + +use super::operator_name::OperatorName; +use crate::sql::common::FsSchema; + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChainedLogicalOperator { + pub operator_id: String, + pub operator_name: OperatorName, + pub operator_config: Vec, +} + +impl ChainedLogicalOperator { + pub fn extract_connector_name(&self) -> Option { + if matches!( + self.operator_name, + OperatorName::ConnectorSource | OperatorName::ConnectorSink + ) { + ConnectorOp::decode(self.operator_config.as_slice()) + .ok() + .map(|op| op.connector) + } else { + None + } + } + + pub fn extract_feature(&self) -> Option { + match self.operator_name { + OperatorName::AsyncUdf => Some("async-udf".to_string()), + OperatorName::Join => Some("join-with-expiration".to_string()), + OperatorName::InstantJoin => Some("windowed-join".to_string()), + OperatorName::WindowFunction => Some("sql-window-function".to_string()), + OperatorName::LookupJoin => Some("lookup-join".to_string()), + OperatorName::TumblingWindowAggregate => { + Some("sql-tumbling-window-aggregate".to_string()) + } + OperatorName::SlidingWindowAggregate => { + Some("sql-sliding-window-aggregate".to_string()) + } + OperatorName::SessionWindowAggregate => { + Some("sql-session-window-aggregate".to_string()) + } + OperatorName::UpdatingAggregate => Some("sql-updating-aggregate".to_string()), + OperatorName::ConnectorSource => { + self.extract_connector_name().map(|c| format!("{c}-source")) + } + OperatorName::ConnectorSink => { + self.extract_connector_name().map(|c| format!("{c}-sink")) + } + _ => None, + } + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct OperatorChain { + pub(crate) operators: Vec, + pub(crate) edges: Vec>, +} + +impl OperatorChain { + pub fn new(operator: ChainedLogicalOperator) -> Self { + Self { + operators: 
vec![operator], + edges: vec![], + } + } + + pub fn iter(&self) -> impl Iterator>)> { + self.operators + .iter() + .zip_longest(&self.edges) + .filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator>)> { + self.operators + .iter_mut() + .zip_longest(&self.edges) + .filter_map(|e| match e { + EitherOrBoth::Both(op, edge) => Some((op, Some(edge))), + EitherOrBoth::Left(op) => Some((op, None)), + EitherOrBoth::Right(_) => None, + }) + } + + pub fn first(&self) -> &ChainedLogicalOperator { + self.operators + .first() + .expect("OperatorChain must contain at least one operator") + } + + pub fn len(&self) -> usize { + self.operators.len() + } + + pub fn is_empty(&self) -> bool { + self.operators.is_empty() + } + + pub fn is_source(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSource + } + + pub fn is_sink(&self) -> bool { + self.operators[0].operator_name == OperatorName::ConnectorSink + } +} diff --git a/src/sql/logical_node/logical/operator_name.rs b/src/sql/logical_node/logical/operator_name.rs new file mode 100644 index 00000000..57f53f90 --- /dev/null +++ b/src/sql/logical_node/logical/operator_name.rs @@ -0,0 +1,82 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::str::FromStr; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use strum::{Display, EnumString, IntoStaticStr}; + +use crate::sql::common::constants::operator_feature; + +#[derive(Clone, Copy, Debug, Eq, PartialEq, EnumString, Display, IntoStaticStr)] +pub enum OperatorName { + ExpressionWatermark, + Value, + KeyBy, + Projection, + AsyncUdf, + Join, + InstantJoin, + LookupJoin, + WindowFunction, + TumblingWindowAggregate, + SlidingWindowAggregate, + SessionWindowAggregate, + UpdatingAggregate, + ConnectorSource, + ConnectorSink, +} + +impl OperatorName { + /// Registry / worker lookup key; matches [`Display`] and protobuf operator names. + #[inline] + pub fn as_registry_key(self) -> &'static str { + self.into() + } + + pub fn feature_tag(self) -> Option<&'static str> { + match self { + Self::ExpressionWatermark | Self::Value | Self::Projection => None, + Self::AsyncUdf => Some(operator_feature::ASYNC_UDF), + Self::Join => Some(operator_feature::JOIN_WITH_EXPIRATION), + Self::InstantJoin => Some(operator_feature::WINDOWED_JOIN), + Self::WindowFunction => Some(operator_feature::SQL_WINDOW_FUNCTION), + Self::LookupJoin => Some(operator_feature::LOOKUP_JOIN), + Self::TumblingWindowAggregate => Some(operator_feature::SQL_TUMBLING_WINDOW_AGGREGATE), + Self::SlidingWindowAggregate => Some(operator_feature::SQL_SLIDING_WINDOW_AGGREGATE), + Self::SessionWindowAggregate => Some(operator_feature::SQL_SESSION_WINDOW_AGGREGATE), + Self::UpdatingAggregate => Some(operator_feature::SQL_UPDATING_AGGREGATE), + Self::KeyBy => Some(operator_feature::KEY_BY_ROUTING), + Self::ConnectorSource => Some(operator_feature::CONNECTOR_SOURCE), + Self::ConnectorSink => Some(operator_feature::CONNECTOR_SINK), + } + } +} + +impl Serialize for OperatorName { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +impl<'de> Deserialize<'de> for OperatorName { + fn deserialize(deserializer: D) 
-> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Self::from_str(&s).map_err(serde::de::Error::custom) + } +} diff --git a/src/sql/logical_node/logical/program_config.rs b/src/sql/logical_node/logical/program_config.rs new file mode 100644 index 00000000..177326f4 --- /dev/null +++ b/src/sql/logical_node/logical/program_config.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use protocol::function_stream_graph::FsProgramConfig; + +/// Placeholder program-level config (UDF tables live elsewhere; wire maps stay empty). +#[derive(Clone, Debug, Default)] +pub struct ProgramConfig {} + +impl From for FsProgramConfig { + fn from(_: ProgramConfig) -> Self { + Self { + udf_dylibs: Default::default(), + python_udfs: Default::default(), + } + } +} + +impl From for ProgramConfig { + fn from(_: FsProgramConfig) -> Self { + Self::default() + } +} diff --git a/src/sql/logical_node/logical/python_udf_config.rs b/src/sql/logical_node/logical/python_udf_config.rs new file mode 100644 index 00000000..6e7d5c66 --- /dev/null +++ b/src/sql/logical_node/logical/python_udf_config.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct PythonUdfConfig { + pub arg_types: Vec, + pub return_type: DataType, + pub name: Arc, + pub definition: Arc, +} diff --git a/src/sql/logical_node/lookup.rs b/src/sql/logical_node/lookup.rs new file mode 100644 index 00000000..00f624a7 --- /dev/null +++ b/src/sql/logical_node/lookup.rs @@ -0,0 +1,270 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchemaRef, JoinType, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion::sql::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::function_stream_graph; +use protocol::function_stream_graph::{ + ConnectorOp, GenericConnectorConfig, LookupJoinCondition, LookupJoinOperator, +}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +pub const DICTIONARY_SOURCE_NODE_NAME: &str = extension_node::REFERENCE_TABLE_SOURCE; +pub const STREAM_DICTIONARY_JOIN_NODE_NAME: &str = extension_node::STREAM_REFERENCE_JOIN; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ReferenceTableSourceNode { + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(ReferenceTableSourceNode, source_definition); + +impl UserDefinedLogicalNodeCore for ReferenceTableSourceNode { + fn name(&self) -> &str { + DICTIONARY_SOURCE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "ReferenceTableSource: Schema={}", self.resolved_schema) + } + + fn 
with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if !inputs.is_empty() { + return internal_err!( + "ReferenceTableSource is a leaf node and cannot accept upstream inputs" + ); + } + + Ok(Self { + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct StreamReferenceJoinNode { + pub(crate) upstream_stream_plan: LogicalPlan, + pub(crate) output_schema: DFSchemaRef, + pub(crate) external_dictionary: SourceTable, + pub(crate) equijoin_conditions: Vec<(Expr, Column)>, + pub(crate) post_join_filter: Option, + pub(crate) namespace_alias: Option, + pub(crate) join_semantics: JoinType, +} + +multifield_partial_ord!( + StreamReferenceJoinNode, + upstream_stream_plan, + external_dictionary, + equijoin_conditions, + post_join_filter, + namespace_alias +); + +impl StreamReferenceJoinNode { + fn compile_join_conditions(&self, planner: &Planner) -> Result> { + self.equijoin_conditions + .iter() + .map(|(logical_left_expr, right_column)| { + let physical_expr = + planner.create_physical_expr(logical_left_expr, &self.output_schema)?; + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(LookupJoinCondition { + left_expr: serialized_expr.encode_to_vec(), + right_key: right_column.name.clone(), + }) + }) + .collect() + } + + fn map_api_join_type(&self) -> Result { + match self.join_semantics { + JoinType::Inner => Ok(function_stream_graph::JoinType::Inner as i32), + JoinType::Left => Ok(function_stream_graph::JoinType::Left as i32), + unsupported => plan_err!( + "Unsupported join type '{unsupported}' for dictionary lookups. Only INNER and LEFT joins are permitted." 
+ ), + } + } + + fn build_engine_operator( + &self, + planner: &Planner, + _upstream_schema: &FsSchemaRef, + ) -> Result { + let internal_input_schema = + FsSchema::from_schema_unkeyed(Arc::new(self.output_schema.as_ref().into()))?; + let dictionary_physical_schema = self.external_dictionary.produce_physical_schema(); + let lookup_fs_schema = + FsSchema::from_schema_unkeyed(add_timestamp_field_arrow(dictionary_physical_schema))?; + + let properties: HashMap = self + .external_dictionary + .catalog_with_options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + Ok(LookupJoinOperator { + input_schema: Some(internal_input_schema.into()), + lookup_schema: Some(lookup_fs_schema.clone().into()), + connector: Some(ConnectorOp { + connector: self.external_dictionary.adapter_type.clone(), + fs_schema: Some(lookup_fs_schema.into()), + name: self.external_dictionary.table_identifier.clone(), + description: self.external_dictionary.description.clone(), + config: Some( + protocol::function_stream_graph::connector_op::Config::Generic( + GenericConnectorConfig { properties }, + ), + ), + }), + key_exprs: self.compile_join_conditions(planner)?, + join_type: self.map_api_join_type()?, + ttl_micros: self + .external_dictionary + .lookup_cache_ttl + .map(|t| t.as_micros() as u64), + max_capacity_bytes: self.external_dictionary.lookup_cache_max_bytes, + }) + } +} + +impl StreamingOperatorBlueprint for StreamReferenceJoinNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!("StreamReferenceJoinNode requires exactly one upstream stream input"); + } + let upstream_schema = input_schemas.remove(0); + + let operator_config = self.build_engine_operator(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("lookup_join_{node_index}"), + 
OperatorName::LookupJoin, + operator_config.encode_to_vec(), + format!( + "DictionaryJoin<{}>", + self.external_dictionary.table_identifier + ), + 1, + ); + + let incoming_edge = + LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(self.output_schema.inner().clone()) + .expect("Failed to convert lookup join output schema to FsSchema") + } +} + +impl UserDefinedLogicalNodeCore for StreamReferenceJoinNode { + fn name(&self) -> &str { + STREAM_DICTIONARY_JOIN_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_stream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.output_schema + } + + fn expressions(&self) -> Vec { + let mut exprs: Vec<_> = self + .equijoin_conditions + .iter() + .map(|(l, _)| l.clone()) + .collect(); + if let Some(filter) = &self.post_join_filter { + exprs.push(filter.clone()); + } + exprs + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamReferenceJoin: join_type={:?} | {}", + self.join_semantics, self.output_schema + ) + } + + fn with_exprs_and_inputs(&self, _: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!( + "StreamReferenceJoinNode expects exactly 1 upstream plan, got {}", + inputs.len() + ); + } + Ok(Self { + upstream_stream_plan: inputs[0].clone(), + output_schema: self.output_schema.clone(), + external_dictionary: self.external_dictionary.clone(), + equijoin_conditions: self.equijoin_conditions.clone(), + post_join_filter: self.post_join_filter.clone(), + namespace_alias: self.namespace_alias.clone(), + join_semantics: self.join_semantics, + }) + } +} diff --git a/src/sql/logical_node/macros.rs b/src/sql/logical_node/macros.rs new file mode 100644 index 00000000..4ce649c2 --- /dev/null +++ 
// ================================ macros.rs ================================
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// Implements `PartialOrd` for `$ty` by comparing the listed fields
/// lexicographically: the first non-equal field comparison decides the order,
/// and `None` from any field comparison (e.g. a NaN float) propagates out.
///
/// Fields are `tt` so both named fields and tuple indices work.
/// Generalized (backward-compatibly) to accept a trailing comma after the
/// field list.
#[macro_export]
macro_rules! multifield_partial_ord {
    ($ty:ty, $($field:tt),* $(,)?) => {
        impl PartialOrd for $ty {
            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
                $(
                    let cmp = self.$field.partial_cmp(&other.$field)?;
                    if cmp != std::cmp::Ordering::Equal {
                        return Some(cmp);
                    }
                )*
                Some(std::cmp::Ordering::Equal)
            }
        }
    };
}

// ================================= mod.rs ==================================
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +pub mod logical; + +mod macros; + +pub(crate) mod streaming_operator_blueprint; +pub(crate) use streaming_operator_blueprint::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +pub(crate) mod aggregate; +pub(crate) mod debezium; +pub(crate) mod join; +pub(crate) mod key_calculation; +pub(crate) mod lookup; +pub(crate) mod projection; +pub(crate) mod remote_table; +pub(crate) mod sink; +pub(crate) mod table_source; +pub(crate) mod updating_aggregate; +pub(crate) mod watermark_node; +pub(crate) mod windows_function; + +pub(crate) mod timestamp_append; +pub(crate) use timestamp_append::SystemTimestampInjectorNode; + +pub(crate) mod async_udf; +pub(crate) use async_udf::AsyncFunctionExecutionNode; + +pub(crate) mod is_retract; +pub(crate) use is_retract::IsRetractExtension; + +mod extension_try_from; diff --git a/src/sql/logical_node/projection.rs b/src/sql/logical_node/projection.rs new file mode 100644 index 00000000..df55e575 --- /dev/null +++ b/src/sql/logical_node/projection.rs @@ -0,0 +1,239 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchema, DFSchemaRef, Result, internal_err}; +use datafusion::logical_expr::{Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; + +use protocol::function_stream_graph::ProjectionOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, sql_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::types::{QualifiedField, build_df_schema}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_PROJECTION_NODE_NAME: &str = extension_node::STREAM_PROJECTION; +const DEFAULT_PROJECTION_LABEL: &str = sql_field::DEFAULT_PROJECTION_LABEL; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Projection within a streaming execution topology. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamProjectionNode { + pub(crate) upstream_plans: Vec, + pub(crate) operator_label: Option, + pub(crate) projection_exprs: Vec, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_shuffle: bool, +} + +multifield_partial_ord!(StreamProjectionNode, operator_label, projection_exprs); + +impl StreamProjectionNode { + pub(crate) fn try_new( + upstream_plans: Vec, + operator_label: Option, + projection_exprs: Vec, + ) -> Result { + if upstream_plans.is_empty() { + return internal_err!("StreamProjectionNode requires at least one upstream plan"); + } + let primary_input = &upstream_plans[0]; + let upstream_schema = primary_input.schema(); + + let mut projected_fields = Vec::with_capacity(projection_exprs.len()); + for logical_expr in &projection_exprs { + let arrow_field = logical_expr.to_field(upstream_schema)?; + projected_fields.push(QualifiedField::from(arrow_field)); + } + + let resolved_schema = Arc::new(build_df_schema(&projected_fields)?); + + Ok(Self { + upstream_plans, + operator_label, + projection_exprs, + resolved_schema, + requires_shuffle: false, + }) + } + + pub(crate) fn with_shuffle_routing(mut self) -> Self { + self.requires_shuffle = true; + self + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result { + if input_schemas.is_empty() { + return internal_err!("No input schemas provided to projection planner"); + } + let primary_schema = input_schemas[0].clone(); + + for schema in input_schemas.iter().skip(1) { + if **schema != *primary_schema { + return internal_err!( + "Schema mismatch: All upstream inputs to a projection node must share the identical schema topology." 
+ ); + } + } + + Ok(primary_schema) + } + + fn compile_physical_expressions( + &self, + planner: &Planner, + input_df_schema: &DFSchemaRef, + ) -> Result>> { + self.projection_exprs + .iter() + .map(|logical_expr| { + let physical_expr = planner + .create_physical_expr(logical_expr, input_df_schema) + .map_err(|e| e.context("Failed to compile physical projection expression"))?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(serialized_expr.encode_to_vec()) + }) + .collect() + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamProjectionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let unified_input_schema = Self::validate_uniform_schemas(&input_schemas)?; + let input_df_schema = Arc::new(DFSchema::try_from( + unified_input_schema.schema.as_ref().clone(), + )?); + + let compiled_expr_payloads = + self.compile_physical_expressions(planner, &input_df_schema)?; + + let operator_config = ProjectionOperator { + name: self + .operator_label + .as_deref() + .unwrap_or(DEFAULT_PROJECTION_LABEL) + .to_string(), + input_schema: Some(unified_input_schema.as_ref().clone().into()), + output_schema: Some(self.yielded_schema().into()), + exprs: compiled_expr_payloads, + }; + + let node_identifier = format!("projection_{node_index}"); + let label = format!( + "ArrowProjection<{}>", + self.operator_label.as_deref().unwrap_or("_") + ); + + let logical_node = LogicalNode::single( + node_index as u32, + node_identifier, + OperatorName::Projection, + operator_config.encode_to_vec(), + label, + 1, + ); + + let routing_strategy = if self.requires_shuffle { + LogicalEdgeType::Shuffle + } 
else { + LogicalEdgeType::Forward + }; + + let outgoing_edge = + LogicalEdge::project_all(routing_strategy, (*unified_input_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![outgoing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.resolved_schema.as_arrow().clone())) + .expect("Fatal: Failed to generate unkeyed output schema for projection") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamProjectionNode { + fn name(&self) -> &str { + STREAM_PROJECTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamProjectionNode: RequiresShuffle={}, Schema={}", + self.requires_shuffle, self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + let mut new_node = Self::try_new( + inputs, + self.operator_label.clone(), + self.projection_exprs.clone(), + )?; + + if self.requires_shuffle { + new_node = new_node.with_shuffle_routing(); + } + + Ok(new_node) + } +} diff --git a/src/sql/logical_node/remote_table.rs b/src/sql/logical_node/remote_table.rs new file mode 100644 index 00000000..d43a87e0 --- /dev/null +++ b/src/sql/logical_node/remote_table.rs @@ -0,0 +1,190 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; + +use protocol::function_stream_graph::ValuePlanOperator; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::StreamingExtensionCodec; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const REMOTE_TABLE_NODE_NAME: &str = extension_node::REMOTE_TABLE_BOUNDARY; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Segments the execution graph and merges nodes sharing the same identifier; acts as a boundary. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct RemoteTableBoundaryNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) table_identifier: TableReference, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) requires_materialization: bool, +} + +multifield_partial_ord!( + RemoteTableBoundaryNode, + upstream_plan, + table_identifier, + requires_materialization +); + +impl RemoteTableBoundaryNode { + fn compile_engine_operator(&self, planner: &Planner) -> Result> { + let physical_plan = planner.sync_plan(&self.upstream_plan)?; + + let physical_plan_proto = PhysicalPlanNode::try_from_physical_plan( + physical_plan, + &StreamingExtensionCodec::default(), + )?; + + let operator_config = ValuePlanOperator { + name: format!("value_calculation({})", self.table_identifier), + physical_plan: physical_plan_proto.encode_to_vec(), + }; + + Ok(operator_config.encode_to_vec()) + } + + fn validate_uniform_schemas(input_schemas: &[FsSchemaRef]) -> Result<()> { + if input_schemas.len() <= 1 { + return Ok(()); + } + + let primary_schema = &input_schemas[0]; + for schema in input_schemas.iter().skip(1) { + if *schema != *primary_schema { + return plan_err!( + "Topology error: Multiple input streams routed to the same remote table must share an identical schema structure." 
+ ); + } + } + + Ok(()) + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for RemoteTableBoundaryNode { + fn operator_identity(&self) -> Option { + if self.requires_materialization { + Some(NamedNode::RemoteTable(self.table_identifier.clone())) + } else { + None + } + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + Self::validate_uniform_schemas(&input_schemas)?; + + let operator_payload = self.compile_engine_operator(planner)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("value_{node_index}"), + OperatorName::Value, + operator_payload, + self.table_identifier.to_string(), + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|schema| LogicalEdge::project_all(LogicalEdgeType::Forward, (*schema).clone())) + .collect(); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]) + .expect("Fatal: Failed to generate output schema for remote table boundary") + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for RemoteTableBoundaryNode { + fn name(&self) -> &str { + REMOTE_TABLE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "RemoteTableBoundaryNode: Identifier={}, 
Materialized={}, Schema={}", + self.table_identifier, self.requires_materialization, self.resolved_schema + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "RemoteTableBoundaryNode expects exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Ok(Self { + upstream_plan: inputs.remove(0), + table_identifier: self.table_identifier.clone(), + resolved_schema: self.resolved_schema.clone(), + requires_materialization: self.requires_materialization, + }) + } +} diff --git a/src/sql/logical_node/sink.rs b/src/sql/logical_node/sink.rs new file mode 100644 index 00000000..dbfcaa55 --- /dev/null +++ b/src/sql/logical_node/sink.rs @@ -0,0 +1,228 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef, UPDATING_META_FIELD}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::Table; + +use super::debezium::PackDebeziumEnvelopeNode; +use super::remote_table::RemoteTableBoundaryNode; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_EGRESS_NODE_NAME: &str = extension_node::STREAM_EGRESS; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Terminal node routing processed data into an external sink (e.g. Kafka, PostgreSQL). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamEgressNode { + pub(crate) target_identifier: TableReference, + pub(crate) destination_table: Table, + pub(crate) egress_schema: DFSchemaRef, + upstream_plans: Arc>, +} + +multifield_partial_ord!(StreamEgressNode, target_identifier, upstream_plans); + +impl StreamEgressNode { + pub fn try_new( + target_identifier: TableReference, + destination_table: Table, + initial_schema: DFSchemaRef, + upstream_plan: LogicalPlan, + ) -> Result { + let (mut processed_plan, mut resolved_schema) = + Self::apply_cdc_transformations(upstream_plan, initial_schema, &destination_table)?; + + Self::enforce_computational_boundary(&mut resolved_schema, &mut processed_plan); + + Ok(Self { + target_identifier, + destination_table, + egress_schema: resolved_schema, + upstream_plans: Arc::new(vec![processed_plan]), + }) + } + + fn apply_cdc_transformations( + plan: LogicalPlan, + schema: DFSchemaRef, + destination: &Table, + ) -> Result<(LogicalPlan, DFSchemaRef)> { + let is_upstream_updating = plan + .schema() + .has_column_with_unqualified_name(UPDATING_META_FIELD); + + match destination { + Table::ConnectorTable(connector) => { + let is_sink_updating = connector.is_updating(); + + match (is_upstream_updating, is_sink_updating) { + (_, true) => { + let debezium_encoder = PackDebeziumEnvelopeNode::try_new(plan)?; + let wrapped_plan = LogicalPlan::Extension(Extension { + node: Arc::new(debezium_encoder), + }); + let new_schema = wrapped_plan.schema().clone(); + + Ok((wrapped_plan, new_schema)) + } + (true, false) => { + plan_err!( + "Topology Mismatch: The upstream is producing an updating stream (CDC), \ + but the target sink '{}' is not configured to accept updates. \ + Hint: set `format = 'debezium_json'` in the WITH clause.", + connector.name() + ) + } + (false, false) => Ok((plan, schema)), + } + } + Table::LookupTable(..) => { + plan_err!( + "Topology Violation: A Lookup Table cannot be used as a streaming data sink." 
+ ) + } + Table::TableFromQuery { .. } => Ok((plan, schema)), + } + } + + fn enforce_computational_boundary(schema: &mut DFSchemaRef, plan: &mut LogicalPlan) { + let requires_boundary = if let LogicalPlan::Extension(extension) = plan { + let stream_ext: &dyn StreamingOperatorBlueprint = (&extension.node) + .try_into() + .expect("Fatal: Egress node encountered an extension that does not implement StreamingOperatorBlueprint"); + + stream_ext.is_passthrough_boundary() + } else { + true + }; + + if requires_boundary { + let boundary_node = RemoteTableBoundaryNode { + upstream_plan: plan.clone(), + table_identifier: TableReference::bare("sink projection"), + resolved_schema: schema.clone(), + requires_materialization: false, + }; + + *plan = LogicalPlan::Extension(Extension { + node: Arc::new(boundary_node), + }); + } + } +} + +// ----------------------------------------------------------------------------- +// Stream Extension Trait Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamEgressNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Sink(self.target_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _planner: &Planner, + node_index: usize, + input_schemas: Vec, + ) -> Result { + let connector_operator = self + .destination_table + .connector_op() + .map_err(|e| e.context("Failed to generate connector operation payload"))?; + + let operator_description = connector_operator.description.clone(); + let operator_payload = connector_operator.encode_to_vec(); + + let logical_node = LogicalNode::single( + node_index as u32, + format!("sink_{}_{node_index}", self.target_identifier), + OperatorName::ConnectorSink, + operator_payload, + operator_description, + 1, + ); + + let routing_edges: Vec = input_schemas + .into_iter() + .map(|input_schema| { + LogicalEdge::project_all(LogicalEdgeType::Forward, (*input_schema).clone()) + }) + .collect(); + + 
Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges, + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_fields(vec![]) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamEgressNode { + fn name(&self) -> &str { + STREAM_EGRESS_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.upstream_plans.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.egress_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamEgressNode({:?}): Schema={}", + self.target_identifier, self.egress_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + Ok(Self { + target_identifier: self.target_identifier.clone(), + destination_table: self.destination_table.clone(), + egress_schema: self.egress_schema.clone(), + upstream_plans: Arc::new(inputs), + }) + } +} diff --git a/src/sql/logical_node/streaming_operator_blueprint.rs b/src/sql/logical_node/streaming_operator_blueprint.rs new file mode 100644 index 00000000..d3f9d459 --- /dev/null +++ b/src/sql/logical_node/streaming_operator_blueprint.rs @@ -0,0 +1,65 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+use std::fmt::Debug;
+
+use datafusion::common::Result;
+
+use crate::sql::common::{FsSchema, FsSchemaRef};
+use crate::sql::logical_node::logical::{LogicalEdge, LogicalNode};
+use crate::sql::logical_planner::planner::{NamedNode, Planner};
+
+// -----------------------------------------------------------------------------
+// Core Execution Blueprint
+// -----------------------------------------------------------------------------
+
+/// Atomic unit within a streaming execution topology: translates streaming SQL into graph nodes.
+pub(crate) trait StreamingOperatorBlueprint: Debug {
+    /// Canonical named identity for this operator, if any (sources, sinks, etc.).
+    fn operator_identity(&self) -> Option<NamedNode>;
+
+    /// Compiles this operator into a graph vertex and its incoming routing edges.
+    fn compile_to_graph_node(
+        &self,
+        compiler_context: &Planner,
+        node_id_sequence: usize,
+        upstream_schemas: Vec<FsSchemaRef>,
+    ) -> Result<CompiledTopologyNode>;
+
+    /// Schema of records this operator yields downstream.
+    fn yielded_schema(&self) -> FsSchema;
+
+    /// Logical passthrough boundary (no physical state change); default is stateful / materializing.
+    fn is_passthrough_boundary(&self) -> bool {
+        false
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Graph Topology Structures
+// -----------------------------------------------------------------------------
+
+/// Compiled vertex: execution unit plus upstream routing edges.
+#[derive(Debug, Clone)]
+pub(crate) struct CompiledTopologyNode {
+    pub execution_unit: LogicalNode,
+    pub routing_edges: Vec<LogicalEdge>,
+}
+
+impl CompiledTopologyNode {
+    pub fn new(execution_unit: LogicalNode, routing_edges: Vec<LogicalEdge>) -> Self {
+        Self {
+            execution_unit,
+            routing_edges,
+        }
+    }
+}
diff --git a/src/sql/logical_node/table_source.rs b/src/sql/logical_node/table_source.rs
new file mode 100644
index 00000000..65f4459f
--- /dev/null
+++ b/src/sql/logical_node/table_source.rs
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use prost::Message; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::debezium::DebeziumSchemaCodec; +use crate::sql::logical_node::logical::{LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::SourceTable; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::build_df_schema; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAM_INGESTION_NODE_NAME: &str = extension_node::STREAM_INGESTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Foundational ingestion point: connects to external systems and injects raw or CDC data. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct StreamIngestionNode { + pub(crate) source_identifier: TableReference, + pub(crate) source_definition: SourceTable, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(StreamIngestionNode, source_identifier, source_definition); + +impl StreamIngestionNode { + pub fn try_new( + source_identifier: TableReference, + source_definition: SourceTable, + ) -> Result { + let resolved_schema = Self::build_ingestion_schema(&source_identifier, &source_definition)?; + + Ok(Self { + source_identifier, + source_definition, + resolved_schema, + }) + } + + fn build_ingestion_schema( + identifier: &TableReference, + definition: &SourceTable, + ) -> Result { + let physical_fields: Vec<_> = definition + .schema_specs + .iter() + .filter(|col| !col.is_computed()) + .map(|col| { + ( + Some(identifier.clone()), + Arc::new(col.arrow_field().clone()), + ) + .into() + }) + .collect(); + + let base_schema = Arc::new(build_df_schema(&physical_fields)?); + + let enveloped_schema = if definition.is_updating() { + DebeziumSchemaCodec::wrap_into_envelope(&base_schema, Some(identifier.clone()))? 
+ } else { + base_schema + }; + + add_timestamp_field(enveloped_schema, Some(identifier.clone())) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamIngestionNode { + fn name(&self) -> &str { + STREAM_INGESTION_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "StreamIngestionNode({}): Schema={}", + self.source_identifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs(&self, _exprs: Vec, inputs: Vec) -> Result { + if !inputs.is_empty() { + return plan_err!( + "StreamIngestionNode acts as a leaf boundary and cannot accept upstream inputs." + ); + } + + Ok(Self { + source_identifier: self.source_identifier.clone(), + source_definition: self.source_definition.clone(), + resolved_schema: self.resolved_schema.clone(), + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamIngestionNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Source(self.source_identifier.clone())) + } + + fn compile_to_graph_node( + &self, + _compiler_context: &Planner, + node_id_sequence: usize, + upstream_schemas: Vec, + ) -> Result { + if !upstream_schemas.is_empty() { + return plan_err!( + "Topology Violation: StreamIngestionNode is a source origin and cannot process upstream routing edges." 
+ ); + } + + let sql_source = self.source_definition.as_sql_source()?; + let connector_payload = sql_source.source.config.encode_to_vec(); + let operator_description = sql_source.source.config.description.clone(); + + let execution_unit = LogicalNode::single( + node_id_sequence as u32, + format!("source_{}_{node_id_sequence}", self.source_identifier), + OperatorName::ConnectorSource, + connector_payload, + operator_description, + 1, + ); + + Ok(CompiledTopologyNode::new(execution_unit, vec![])) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_keys(Arc::new(self.resolved_schema.as_ref().into()), vec![]) + .expect("Fatal: Failed to generate output schema for stream ingestion") + } +} diff --git a/src/sql/logical_node/timestamp_append.rs b/src/sql/logical_node/timestamp_append.rs new file mode 100644 index 00000000..630e5a66 --- /dev/null +++ b/src/sql/logical_node/timestamp_append.rs @@ -0,0 +1,121 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::extension_node; +use crate::sql::schema::utils::{add_timestamp_field, has_timestamp_field}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const TIMESTAMP_INJECTOR_NODE_NAME: &str = extension_node::SYSTEM_TIMESTAMP_INJECTOR; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Injects the mandatory system `_timestamp` field into the upstream streaming schema. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct SystemTimestampInjectorNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) target_qualifier: Option, + pub(crate) resolved_schema: DFSchemaRef, +} + +multifield_partial_ord!(SystemTimestampInjectorNode, upstream_plan, target_qualifier); + +impl SystemTimestampInjectorNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + target_qualifier: Option, + ) -> Result { + let upstream_schema = upstream_plan.schema(); + + if has_timestamp_field(upstream_schema) { + return internal_err!( + "Topology Violation: Attempted to inject a system timestamp into an upstream plan \ + that already contains one. 
\ + \nPlan:\n {:?} \nSchema:\n {:?}", + upstream_plan, + upstream_schema + ); + } + + let resolved_schema = + add_timestamp_field(upstream_schema.clone(), target_qualifier.clone())?; + + Ok(Self { + upstream_plan, + target_qualifier, + resolved_schema, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for SystemTimestampInjectorNode { + fn name(&self) -> &str { + TIMESTAMP_INJECTOR_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + let field_names = self + .resolved_schema + .fields() + .iter() + .map(|field| field.name().to_string()) + .collect::>() + .join(", "); + + write!( + f, + "SystemTimestampInjector(Qualifier={:?}): [{}]", + self.target_qualifier, field_names + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "SystemTimestampInjectorNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + + Self::try_new(inputs.remove(0), self.target_qualifier.clone()) + } +} diff --git a/src/sql/logical_node/updating_aggregate.rs b/src/sql/logical_node/updating_aggregate.rs new file mode 100644 index 00000000..598d20eb --- /dev/null +++ b/src/sql/logical_node/updating_aggregate.rs @@ -0,0 +1,243 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Duration; + +use datafusion::common::{DFSchemaRef, Result, TableReference, ToDFSchema, internal_err, plan_err}; +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::logical_expr::{ + Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore, col, lit, +}; +use datafusion::prelude::named_struct; +use datafusion::scalar::ScalarValue; +use datafusion_proto::physical_plan::AsExecutionPlan; +use datafusion_proto::protobuf::PhysicalPlanNode; +use prost::Message; +use protocol::function_stream_graph::UpdatingAggregateOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, updating_state_field}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::functions::multi_hash; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{ + CompiledTopologyNode, IsRetractExtension, StreamingOperatorBlueprint, +}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::StreamingExtensionCodec; + +// ----------------------------------------------------------------------------- +// Constants & Configuration +// ----------------------------------------------------------------------------- + +pub(crate) const CONTINUOUS_AGGREGATE_NODE_NAME: &str = extension_node::CONTINUOUS_AGGREGATE; + +const DEFAULT_FLUSH_INTERVAL_MICROS: u64 = 10_000_000; + +const STATIC_HASH_SIZE_BYTES: i32 = 16; + +// ----------------------------------------------------------------------------- 
+// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Stateful continuous aggregation: running aggregates with updating / retraction semantics. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct ContinuousAggregateNode { + pub(crate) base_aggregate_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, + pub(crate) retract_injected_plan: LogicalPlan, + pub(crate) namespace_qualifier: Option, + pub(crate) state_retention_ttl: Duration, +} + +impl ContinuousAggregateNode { + pub fn try_new( + base_aggregate_plan: LogicalPlan, + partition_key_indices: Vec, + namespace_qualifier: Option, + state_retention_ttl: Duration, + ) -> Result { + let retract_injected_plan = LogicalPlan::Extension(Extension { + node: Arc::new(IsRetractExtension::new( + base_aggregate_plan.clone(), + namespace_qualifier.clone(), + )), + }); + + Ok(Self { + base_aggregate_plan, + partition_key_indices, + retract_injected_plan, + namespace_qualifier, + state_retention_ttl, + }) + } + + fn construct_state_metadata_expr(&self, upstream_schema: &FsSchemaRef) -> Expr { + let routing_keys: Vec = self + .partition_key_indices + .iter() + .map(|&idx| col(upstream_schema.schema.field(idx).name())) + .collect(); + + let state_id_hash = if routing_keys.is_empty() { + Expr::Literal( + ScalarValue::FixedSizeBinary( + STATIC_HASH_SIZE_BYTES, + Some(vec![0; STATIC_HASH_SIZE_BYTES as usize]), + ), + None, + ) + } else { + Expr::ScalarFunction(ScalarFunction { + func: multi_hash(), + args: routing_keys, + }) + }; + + named_struct(vec![ + lit(updating_state_field::IS_RETRACT), + lit(false), + lit(updating_state_field::ID), + state_id_hash, + ]) + } + + fn compile_operator_config( + &self, + planner: &Planner, + upstream_schema: &FsSchemaRef, + ) -> Result { + let upstream_df_schema = upstream_schema.schema.clone().to_dfschema()?; + + let physical_agg_plan = planner.sync_plan(&self.base_aggregate_plan)?; + let 
compiled_agg_payload = PhysicalPlanNode::try_from_physical_plan( + physical_agg_plan, + &StreamingExtensionCodec::default(), + )? + .encode_to_vec(); + + let meta_expr = self.construct_state_metadata_expr(upstream_schema); + let compiled_meta_expr = + planner.serialize_as_physical_expr(&meta_expr, &upstream_df_schema)?; + + Ok(UpdatingAggregateOperator { + name: proto_operator_name::UPDATING_AGGREGATE.to_string(), + input_schema: Some((**upstream_schema).clone().into()), + final_schema: Some(self.yielded_schema().into()), + aggregate_exec: compiled_agg_payload, + metadata_expr: compiled_meta_expr, + flush_interval_micros: DEFAULT_FLUSH_INTERVAL_MICROS, + ttl_micros: self.state_retention_ttl.as_micros() as u64, + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for ContinuousAggregateNode { + fn name(&self) -> &str { + CONTINUOUS_AGGREGATE_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.base_aggregate_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.retract_injected_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "ContinuousAggregateNode(TTL={:?})", + self.state_retention_ttl + ) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "ContinuousAggregateNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Self::try_new( + inputs.remove(0), + self.partition_key_indices.clone(), + self.namespace_qualifier.clone(), + self.state_retention_ttl, + ) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// 
----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for ContinuousAggregateNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: ContinuousAggregateNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let upstream_schema = upstream_schemas.remove(0); + + let operator_config = self.compile_operator_config(planner, &upstream_schema)?; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("updating_aggregate_{node_index}"), + OperatorName::UpdatingAggregate, + operator_config.encode_to_vec(), + proto_operator_name::UPDATING_AGGREGATE.to_string(), + 1, + ); + + let shuffle_edge = + LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*upstream_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![shuffle_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().into())) + .expect("Fatal: Failed to generate unkeyed output schema for continuous aggregate") + } +} diff --git a/src/sql/logical_node/watermark_node.rs b/src/sql/logical_node/watermark_node.rs new file mode 100644 index 00000000..7c83c429 --- /dev/null +++ b/src/sql/logical_node/watermark_node.rs @@ -0,0 +1,229 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{DFSchemaRef, Result, TableReference, internal_err, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use prost::Message; +use protocol::function_stream_graph::ExpressionWatermarkConfig; + +use crate::multifield_partial_ord; +use crate::sql::common::constants::{extension_node, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::schema::utils::add_timestamp_field; +use crate::sql::types::TIMESTAMP_FIELD; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const EVENT_TIME_WATERMARK_NODE_NAME: &str = extension_node::EVENT_TIME_WATERMARK; + +const DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS: u64 = 1_000_000; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Event-time watermark from a user strategy; drives time progress in stateful operators. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) struct EventTimeWatermarkNode { + pub(crate) upstream_plan: LogicalPlan, + pub(crate) namespace_qualifier: TableReference, + pub(crate) watermark_strategy_expr: Expr, + pub(crate) resolved_schema: DFSchemaRef, + pub(crate) internal_timestamp_offset: usize, +} + +multifield_partial_ord!( + EventTimeWatermarkNode, + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + internal_timestamp_offset +); + +impl EventTimeWatermarkNode { + pub(crate) fn try_new( + upstream_plan: LogicalPlan, + namespace_qualifier: TableReference, + watermark_strategy_expr: Expr, + ) -> Result { + let resolved_schema = add_timestamp_field( + upstream_plan.schema().clone(), + Some(namespace_qualifier.clone()), + )?; + + let internal_timestamp_offset = resolved_schema + .index_of_column_by_name(None, TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Fatal: Failed to resolve mandatory temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan, + namespace_qualifier, + watermark_strategy_expr, + resolved_schema, + internal_timestamp_offset, + }) + } + + pub(crate) fn generate_fs_schema(&self) -> FsSchema { + FsSchema::new_unkeyed( + Arc::new(self.resolved_schema.as_ref().into()), + self.internal_timestamp_offset, + ) + } + + fn compile_operator_config(&self, planner: &Planner) -> Result { + let physical_expr = + planner.create_physical_expr(&self.watermark_strategy_expr, &self.resolved_schema)?; + + let serialized_expr = + serialize_physical_expr(&physical_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(ExpressionWatermarkConfig { + period_micros: DEFAULT_WATERMARK_EMISSION_PERIOD_MICROS, + idle_time_micros: None, + expression: serialized_expr.encode_to_vec(), + input_schema: Some(self.generate_fs_schema().into()), + }) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// 
----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for EventTimeWatermarkNode { + fn name(&self) -> &str { + EVENT_TIME_WATERMARK_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.upstream_plan] + } + + fn schema(&self) -> &DFSchemaRef { + &self.resolved_schema + } + + fn expressions(&self) -> Vec { + vec![self.watermark_strategy_expr.clone()] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "EventTimeWatermarkNode({}): Schema={}", + self.namespace_qualifier, self.resolved_schema + ) + } + + fn with_exprs_and_inputs( + &self, + mut exprs: Vec, + mut inputs: Vec, + ) -> Result { + if inputs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 upstream logical plan, but received {}", + inputs.len() + ); + } + if exprs.len() != 1 { + return internal_err!( + "EventTimeWatermarkNode requires exactly 1 watermark strategy expression, but received {}", + exprs.len() + ); + } + + let internal_timestamp_offset = self + .resolved_schema + .index_of_column_by_name(Some(&self.namespace_qualifier), TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Optimizer Error: Lost tracking of temporal column '{}'", + TIMESTAMP_FIELD + )) + })?; + + Ok(Self { + upstream_plan: inputs.remove(0), + namespace_qualifier: self.namespace_qualifier.clone(), + watermark_strategy_expr: exprs.remove(0), + resolved_schema: self.resolved_schema.clone(), + internal_timestamp_offset, + }) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for EventTimeWatermarkNode { + fn operator_identity(&self) -> Option { + Some(NamedNode::Watermark(self.namespace_qualifier.clone())) + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + 
node_index: usize, + mut upstream_schemas: Vec, + ) -> Result { + if upstream_schemas.len() != 1 { + return plan_err!( + "Topology Violation: EventTimeWatermarkNode requires exactly 1 upstream input, received {}", + upstream_schemas.len() + ); + } + + let operator_config = self.compile_operator_config(planner)?; + + let execution_unit = LogicalNode::single( + node_index as u32, + format!("watermark_{node_index}"), + OperatorName::ExpressionWatermark, + operator_config.encode_to_vec(), + runtime_operator_kind::WATERMARK_GENERATOR.to_string(), + 1, + ); + + let incoming_edge = LogicalEdge::project_all( + LogicalEdgeType::Forward, + (*upstream_schemas.remove(0)).clone(), + ); + + Ok(CompiledTopologyNode { + execution_unit, + routing_edges: vec![incoming_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + self.generate_fs_schema() + } +} diff --git a/src/sql/logical_node/windows_function.rs b/src/sql/logical_node/windows_function.rs new file mode 100644 index 00000000..a79ceff3 --- /dev/null +++ b/src/sql/logical_node/windows_function.rs @@ -0,0 +1,189 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Formatter; +use std::sync::Arc; + +use datafusion::common::{Column, DFSchema, DFSchemaRef, Result, internal_err, plan_err}; +use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; +use datafusion_proto::{physical_plan::AsExecutionPlan, protobuf::PhysicalPlanNode}; +use prost::Message; +use protocol::function_stream_graph::WindowFunctionOperator; + +use crate::sql::common::constants::{extension_node, proto_operator_name, runtime_operator_kind}; +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalEdgeType, LogicalNode, OperatorName}; +use crate::sql::logical_planner::planner::{NamedNode, Planner}; +use crate::sql::physical::StreamingExtensionCodec; +use crate::sql::types::TIMESTAMP_FIELD; + +use super::{CompiledTopologyNode, StreamingOperatorBlueprint}; + +// ----------------------------------------------------------------------------- +// Constants & Identifiers +// ----------------------------------------------------------------------------- + +pub(crate) const STREAMING_WINDOW_NODE_NAME: &str = extension_node::STREAMING_WINDOW_FUNCTION; + +// ----------------------------------------------------------------------------- +// Logical Node Definition +// ----------------------------------------------------------------------------- + +/// Stateful streaming window: temporal binning plus underlying window evaluation plan. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub(crate) struct StreamingWindowFunctionNode { + pub(crate) underlying_evaluation_plan: LogicalPlan, + pub(crate) partition_key_indices: Vec, +} + +impl StreamingWindowFunctionNode { + pub fn new(underlying_evaluation_plan: LogicalPlan, partition_key_indices: Vec) -> Self { + Self { + underlying_evaluation_plan, + partition_key_indices, + } + } + + fn compile_temporal_binning_function( + &self, + planner: &Planner, + input_df_schema: &DFSchema, + ) -> Result> { + let timestamp_column = Expr::Column(Column::new_unqualified(TIMESTAMP_FIELD.to_string())); + + let physical_binning_expr = + planner.create_physical_expr(×tamp_column, input_df_schema)?; + + let serialized_expr = + serialize_physical_expr(&physical_binning_expr, &DefaultPhysicalExtensionCodec {})?; + + Ok(serialized_expr.encode_to_vec()) + } + + fn compile_physical_evaluation_plan(&self, planner: &Planner) -> Result> { + let physical_window_plan = planner.sync_plan(&self.underlying_evaluation_plan)?; + + let proto_plan_node = PhysicalPlanNode::try_from_physical_plan( + physical_window_plan, + &StreamingExtensionCodec::default(), + )?; + + Ok(proto_plan_node.encode_to_vec()) + } +} + +// ----------------------------------------------------------------------------- +// DataFusion Logical Node Hooks +// ----------------------------------------------------------------------------- + +impl UserDefinedLogicalNodeCore for StreamingWindowFunctionNode { + fn name(&self) -> &str { + STREAMING_WINDOW_NODE_NAME + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.underlying_evaluation_plan] + } + + fn schema(&self) -> &DFSchemaRef { + self.underlying_evaluation_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "StreamingWindowFunction: Schema={}", self.schema()) + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + mut inputs: Vec, + ) -> Result { 
+ if inputs.len() != 1 { + return internal_err!( + "StreamingWindowFunctionNode requires exactly 1 upstream input, got {}", + inputs.len() + ); + } + + Ok(Self::new( + inputs.remove(0), + self.partition_key_indices.clone(), + )) + } +} + +// ----------------------------------------------------------------------------- +// Core Execution Blueprint Implementation +// ----------------------------------------------------------------------------- + +impl StreamingOperatorBlueprint for StreamingWindowFunctionNode { + fn operator_identity(&self) -> Option { + None + } + + fn compile_to_graph_node( + &self, + planner: &Planner, + node_index: usize, + mut input_schemas: Vec, + ) -> Result { + if input_schemas.len() != 1 { + return plan_err!( + "Topology Violation: StreamingWindowFunctionNode requires exactly 1 upstream input schema, received {}", + input_schemas.len() + ); + } + + let input_schema = input_schemas.remove(0); + + let input_df_schema = DFSchema::try_from(input_schema.schema.as_ref().clone())?; + + let binning_payload = self.compile_temporal_binning_function(planner, &input_df_schema)?; + let evaluation_plan_payload = self.compile_physical_evaluation_plan(planner)?; + + let operator_config = WindowFunctionOperator { + name: proto_operator_name::WINDOW_FUNCTION.to_string(), + input_schema: Some(input_schema.as_ref().clone().into()), + binning_function: binning_payload, + window_function_plan: evaluation_plan_payload, + }; + + let logical_node = LogicalNode::single( + node_index as u32, + format!("window_function_{node_index}"), + OperatorName::WindowFunction, + operator_config.encode_to_vec(), + runtime_operator_kind::STREAMING_WINDOW_EVALUATOR.to_string(), + 1, + ); + + let routing_edge = + LogicalEdge::project_all(LogicalEdgeType::Shuffle, (*input_schema).clone()); + + Ok(CompiledTopologyNode { + execution_unit: logical_node, + routing_edges: vec![routing_edge], + }) + } + + fn yielded_schema(&self) -> FsSchema { + 
FsSchema::from_schema_unkeyed(Arc::new(self.schema().as_ref().clone().into())).expect( + "Fatal: Failed to generate unkeyed output schema for StreamingWindowFunctionNode", + ) + } +} diff --git a/src/sql/logical_planner/mod.rs b/src/sql/logical_planner/mod.rs new file mode 100644 index 00000000..9ecfb676 --- /dev/null +++ b/src/sql/logical_planner/mod.rs @@ -0,0 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod optimizers; + +pub(crate) mod streaming_planner; +pub(crate) use streaming_planner as planner; diff --git a/src/sql/logical_planner/optimizers/chaining.rs b/src/sql/logical_planner/optimizers/chaining.rs new file mode 100644 index 00000000..8260df19 --- /dev/null +++ b/src/sql/logical_planner/optimizers/chaining.rs @@ -0,0 +1,168 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use petgraph::prelude::*; +use petgraph::visit::NodeIndexable; +use tracing::debug; + +use crate::sql::logical_node::logical::{LogicalEdgeType, LogicalGraph, Optimizer}; + +pub struct ChainingOptimizer {} + +impl Optimizer for ChainingOptimizer { + fn optimize_once(&self, plan: &mut LogicalGraph) -> bool { + let mut match_found = None; + + for node_idx in plan.node_indices() { + let mut outgoing = plan.edges_directed(node_idx, Outgoing); + let first_out = outgoing.next(); + if first_out.is_none() || outgoing.next().is_some() { + continue; + } + let edge = first_out.unwrap(); + + if edge.weight().edge_type != LogicalEdgeType::Forward { + continue; + } + + let target_idx = edge.target(); + + let mut incoming = plan.edges_directed(target_idx, Incoming); + let first_in = incoming.next(); + if first_in.is_none() || incoming.next().is_some() { + continue; + } + + let source_node = plan.node_weight(node_idx).expect("Source node missing"); + let target_node = plan.node_weight(target_idx).expect("Target node missing"); + + if source_node.operator_chain.is_source() + || target_node.operator_chain.is_sink() + || source_node.parallelism != target_node.parallelism + { + continue; + } + + match_found = Some((node_idx, target_idx, edge.id())); + break; + } + + if let Some((source_idx, target_idx, edge_id)) = match_found { + let edge_weight = plan.remove_edge(edge_id).expect("Edge should exist"); + + let target_outgoing: Vec<_> = plan + .edges_directed(target_idx, Outgoing) + .map(|e| (e.id(), e.target())) + .collect(); + + for (e_id, next_target_idx) in target_outgoing { + let weight = plan.remove_edge(e_id).expect("Outgoing edge missing"); + plan.add_edge(source_idx, next_target_idx, weight); + } + + let is_source_last = source_idx.index() == plan.node_bound() - 1; + + let target_node = plan + .remove_node(target_idx) + .expect("Target node should exist"); + + let actual_source_idx = if is_source_last { + target_idx + } else { + source_idx + }; + + let source_node = plan + 
.node_weight_mut(actual_source_idx) + .expect("Source node missing"); + + debug!( + "Chaining Optimizer: Fusing '{}' -> '{}'", + source_node.description, target_node.description + ); + + source_node.description = + format!("{} -> {}", source_node.description, target_node.description); + + source_node + .operator_chain + .operators + .extend(target_node.operator_chain.operators); + source_node.operator_chain.edges.push(edge_weight.schema); + + return true; + } + + false + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + + use crate::sql::common::FsSchema; + use crate::sql::logical_node::logical::{ + LogicalEdge, LogicalEdgeType, LogicalGraph, LogicalNode, OperatorName, Optimizer, + }; + + use super::ChainingOptimizer; + + fn forward_edge() -> LogicalEdge { + let s = Arc::new(Schema::new(vec![Field::new( + "_timestamp", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )])); + LogicalEdge::new(LogicalEdgeType::Forward, FsSchema::new_unkeyed(s, 0)) + } + + fn proj_node(id: u32, label: &str) -> LogicalNode { + LogicalNode::single( + id, + format!("op_{label}"), + OperatorName::Projection, + vec![], + label.to_string(), + 1, + ) + } + + fn source_node() -> LogicalNode { + LogicalNode::single( + 0, + "src".into(), + OperatorName::ConnectorSource, + vec![], + "source".into(), + 1, + ) + } + + /// Regression: upstream at last `NodeIndex` + remove non-last downstream swaps indices. 
+ #[test] + fn fusion_remaps_when_upstream_was_last_node_index() { + let mut g = LogicalGraph::new(); + let n0 = g.add_node(source_node()); + let n1 = g.add_node(proj_node(1, "downstream")); + let n2 = g.add_node(proj_node(2, "upstream_last_index")); + let e = forward_edge(); + g.add_edge(n0, n2, e.clone()); + g.add_edge(n2, n1, e); + + let changed = ChainingOptimizer {}.optimize_once(&mut g); + assert!(changed); + assert_eq!(g.node_count(), 2); + } +} diff --git a/src/sql/logical_planner/optimizers/mod.rs b/src/sql/logical_planner/optimizers/mod.rs new file mode 100644 index 00000000..c7981313 --- /dev/null +++ b/src/sql/logical_planner/optimizers/mod.rs @@ -0,0 +1,20 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Logical planner optimizers: graph-level chaining ([`ChainingOptimizer`]) and +//! DataFusion SQL logical-plan rules ([`produce_optimized_plan`]). + +mod chaining; +mod optimized_plan; + +pub use chaining::ChainingOptimizer; +pub use optimized_plan::produce_optimized_plan; diff --git a/src/sql/logical_planner/optimizers/optimized_plan.rs b/src/sql/logical_planner/optimizers/optimized_plan.rs new file mode 100644 index 00000000..fbb64845 --- /dev/null +++ b/src/sql/logical_planner/optimizers/optimized_plan.rs @@ -0,0 +1,95 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::common::Result; +use datafusion::common::config::ConfigOptions; +use datafusion::logical_expr::LogicalPlan; +use datafusion::optimizer::OptimizerContext; +use datafusion::optimizer::OptimizerRule; +use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate; +use datafusion::optimizer::decorrelate_lateral_join::DecorrelateLateralJoin; +use datafusion::optimizer::decorrelate_predicate_subquery::DecorrelatePredicateSubquery; +use datafusion::optimizer::eliminate_cross_join::EliminateCrossJoin; +use datafusion::optimizer::eliminate_duplicated_expr::EliminateDuplicatedExpr; +use datafusion::optimizer::eliminate_filter::EliminateFilter; +use datafusion::optimizer::eliminate_group_by_constant::EliminateGroupByConstant; +use datafusion::optimizer::eliminate_join::EliminateJoin; +use datafusion::optimizer::eliminate_limit::EliminateLimit; +use datafusion::optimizer::eliminate_nested_union::EliminateNestedUnion; +use datafusion::optimizer::eliminate_one_union::EliminateOneUnion; +use datafusion::optimizer::eliminate_outer_join::EliminateOuterJoin; +use datafusion::optimizer::extract_equijoin_predicate::ExtractEquijoinPredicate; +use datafusion::optimizer::filter_null_join_keys::FilterNullJoinKeys; +use datafusion::optimizer::optimizer::Optimizer; +use datafusion::optimizer::propagate_empty_relation::PropagateEmptyRelation; +use datafusion::optimizer::push_down_filter::PushDownFilter; +use datafusion::optimizer::push_down_limit::PushDownLimit; +use 
datafusion::optimizer::replace_distinct_aggregate::ReplaceDistinctWithAggregate; +use datafusion::optimizer::scalar_subquery_to_join::ScalarSubqueryToJoin; +use datafusion::optimizer::simplify_expressions::SimplifyExpressions; +use datafusion::sql::planner::SqlToRel; +use datafusion::sql::sqlparser::ast::Statement; + +use crate::sql::schema::StreamSchemaProvider; + +/// Converts a SQL statement into an optimized DataFusion logical plan. +/// +/// Applies the DataFusion analyzer followed by a curated set of optimizer rules +/// suitable for streaming SQL (some rules like OptimizeProjections are excluded +/// because they can drop event-time calculation fields). +pub fn produce_optimized_plan( + statement: &Statement, + schema_provider: &StreamSchemaProvider, +) -> Result { + let sql_to_rel = SqlToRel::new(schema_provider); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; + + let analyzed_plan = schema_provider.analyzer.execute_and_check( + plan, + &ConfigOptions::default(), + |_plan, _rule| {}, + )?; + + let rules: Vec> = vec![ + Arc::new(EliminateNestedUnion::new()), + Arc::new(SimplifyExpressions::new()), + Arc::new(ReplaceDistinctWithAggregate::new()), + Arc::new(EliminateJoin::new()), + Arc::new(DecorrelatePredicateSubquery::new()), + Arc::new(ScalarSubqueryToJoin::new()), + Arc::new(DecorrelateLateralJoin::new()), + Arc::new(ExtractEquijoinPredicate::new()), + Arc::new(EliminateDuplicatedExpr::new()), + Arc::new(EliminateFilter::new()), + Arc::new(EliminateCrossJoin::new()), + Arc::new(EliminateLimit::new()), + Arc::new(PropagateEmptyRelation::new()), + Arc::new(EliminateOneUnion::new()), + Arc::new(FilterNullJoinKeys::default()), + Arc::new(EliminateOuterJoin::new()), + Arc::new(PushDownLimit::new()), + Arc::new(PushDownFilter::new()), + Arc::new(EliminateGroupByConstant::new()), + Arc::new(CommonSubexprEliminate::new()), + ]; + + let optimizer = Optimizer::with_rules(rules); + let optimized = optimizer.optimize( + analyzed_plan, + 
&OptimizerContext::default(), + |_plan, _rule| {}, + )?; + + Ok(optimized) +} diff --git a/src/sql/logical_planner/streaming_planner.rs b/src/sql/logical_planner/streaming_planner.rs new file mode 100644 index 00000000..e501695d --- /dev/null +++ b/src/sql/logical_planner/streaming_planner.rs @@ -0,0 +1,418 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; +use std::thread; +use std::time::Duration; + +use datafusion::arrow::datatypes::IntervalMonthDayNanoType; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion::common::{ + DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, Spans, plan_err, +}; +use datafusion::execution::context::SessionState; +use datafusion::execution::runtime_env::RuntimeEnvBuilder; +use datafusion::functions::datetime::date_bin; +use datafusion::logical_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNode}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}; +use datafusion_proto::protobuf::{PhysicalExprNode, PhysicalPlanNode}; +use datafusion_proto::{ + physical_plan::AsExecutionPlan, + protobuf::{AggregateMode, physical_plan_node::PhysicalPlanType}, +}; +use petgraph::graph::{DiGraph, NodeIndex}; +use prost::Message; +use tokio::runtime::Builder; +use tokio::sync::oneshot; + +use 
async_trait::async_trait; +use datafusion_common::TableReference; +use datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec; +use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; + +use crate::sql::common::{FsSchema, FsSchemaRef}; +use crate::sql::logical_node::debezium::{ + PACK_NODE_NAME, UNROLL_NODE_NAME, UnrollDebeziumPayloadNode, +}; +use crate::sql::logical_node::key_calculation::KeyExtractionNode; +use crate::sql::logical_node::logical::{LogicalEdge, LogicalGraph, LogicalNode}; +use crate::sql::logical_node::{CompiledTopologyNode, StreamingOperatorBlueprint}; +use crate::sql::physical::{ + CdcDebeziumPackExec, CdcDebeziumUnrollExec, FsMemExec, StreamingDecodingContext, + StreamingExtensionCodec, +}; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::schema::utils::add_timestamp_field_arrow; + +pub(crate) struct SplitPlanOutput { + pub(crate) partial_aggregation_plan: PhysicalPlanNode, + pub(crate) partial_schema: FsSchema, + pub(crate) finish_plan: PhysicalPlanNode, +} +#[derive(Eq, Hash, PartialEq, Debug)] +pub(crate) enum NamedNode { + Source(TableReference), + Watermark(TableReference), + RemoteTable(TableReference), + Sink(TableReference), +} + +pub(crate) struct PlanToGraphVisitor<'a> { + graph: DiGraph, + output_schemas: HashMap, + named_nodes: HashMap, + traversal: Vec>, + planner: Planner<'a>, +} + +impl<'a> PlanToGraphVisitor<'a> { + pub fn new(schema_provider: &'a StreamSchemaProvider, session_state: &'a SessionState) -> Self { + Self { + graph: Default::default(), + output_schemas: Default::default(), + named_nodes: Default::default(), + traversal: vec![], + planner: Planner::new(schema_provider, session_state), + } + } +} + +pub(crate) struct Planner<'a> { + schema_provider: &'a StreamSchemaProvider, + planner: DefaultPhysicalPlanner, + session_state: &'a SessionState, +} + +impl<'a> Planner<'a> { + pub(crate) fn new( + schema_provider: &'a StreamSchemaProvider, + session_state: &'a SessionState, + ) -> 
Self { + let planner = + DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(FsExtensionPlanner {})]); + Self { + schema_provider, + planner, + session_state, + } + } + + pub(crate) fn sync_plan(&self, plan: &LogicalPlan) -> Result> { + let fut = self.planner.create_physical_plan(plan, self.session_state); + let (tx, mut rx) = oneshot::channel(); + thread::scope(|s| { + let builder = thread::Builder::new(); + let builder = if cfg!(debug_assertions) { + builder.stack_size(10_000_000) + } else { + builder + }; + builder + .spawn_scoped(s, move || { + let rt = Builder::new_current_thread().enable_all().build().unwrap(); + rt.block_on(async { + let plan = fut.await; + tx.send(plan).unwrap(); + }); + }) + .unwrap(); + }); + + rx.try_recv().unwrap() + } + + pub(crate) fn create_physical_expr( + &self, + expr: &Expr, + input_dfschema: &DFSchema, + ) -> Result> { + self.planner + .create_physical_expr(expr, input_dfschema, self.session_state) + } + + pub(crate) fn serialize_as_physical_expr( + &self, + expr: &Expr, + schema: &DFSchema, + ) -> Result> { + let physical = self.create_physical_expr(expr, schema)?; + let proto = serialize_physical_expr(&physical, &DefaultPhysicalExtensionCodec {})?; + Ok(proto.encode_to_vec()) + } + + pub(crate) fn split_physical_plan( + &self, + key_indices: Vec, + aggregate: &LogicalPlan, + add_timestamp_field: bool, + ) -> Result { + let physical_plan = self.sync_plan(aggregate)?; + let codec = StreamingExtensionCodec { + context: StreamingDecodingContext::Planning, + }; + let mut physical_plan_node = + PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; + let PhysicalPlanType::Aggregate(mut final_aggregate_proto) = physical_plan_node + .physical_plan_type + .take() + .ok_or_else(|| DataFusionError::Plan("missing physical plan type".to_string()))? 
+ else { + return plan_err!("unexpected physical plan type"); + }; + let AggregateMode::Final = final_aggregate_proto.mode() else { + return plan_err!("unexpected physical plan type"); + }; + + let partial_aggregation_plan = *final_aggregate_proto + .input + .take() + .ok_or_else(|| DataFusionError::Plan("missing input".to_string()))?; + + let partial_aggregation_exec_plan = partial_aggregation_plan.try_into_physical_plan( + self.schema_provider, + &RuntimeEnvBuilder::new().build().unwrap(), + &codec, + )?; + + let partial_schema = partial_aggregation_exec_plan.schema(); + let final_input_table_provider = FsMemExec::new("partial".into(), partial_schema.clone()); + + final_aggregate_proto.input = Some(Box::new(PhysicalPlanNode::try_from_physical_plan( + Arc::new(final_input_table_provider), + &codec, + )?)); + + let finish_plan = PhysicalPlanNode { + physical_plan_type: Some(PhysicalPlanType::Aggregate(final_aggregate_proto)), + }; + + let (partial_schema, timestamp_index) = if add_timestamp_field { + ( + add_timestamp_field_arrow((*partial_schema).clone()), + partial_schema.fields().len(), + ) + } else { + (partial_schema.clone(), partial_schema.fields().len() - 1) + }; + + let partial_schema = FsSchema::new_keyed(partial_schema, timestamp_index, key_indices); + + Ok(SplitPlanOutput { + partial_aggregation_plan, + partial_schema, + finish_plan, + }) + } + + pub fn binning_function_proto( + &self, + width: Duration, + input_schema: DFSchemaRef, + ) -> Result { + let date_bin = date_bin().call(vec![ + Expr::Literal( + ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNanoType::make_value( + 0, + 0, + width.as_nanos() as i64, + ))), + None, + ), + Expr::Column(datafusion::common::Column { + relation: None, + name: "_timestamp".into(), + spans: Spans::new(), + }), + ]); + + let binning_function = self.create_physical_expr(&date_bin, &input_schema)?; + serialize_physical_expr(&binning_function, &DefaultPhysicalExtensionCodec {}) + } +} + +struct FsExtensionPlanner 
{} + +#[async_trait] +impl ExtensionPlanner for FsExtensionPlanner { + async fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _session_state: &SessionState, + ) -> Result>> { + let schema = node.schema().as_ref().into(); + if let Ok::<&dyn StreamingOperatorBlueprint, _>(stream_extension) = node.try_into() + && stream_extension.is_passthrough_boundary() + { + match node.name() { + UNROLL_NODE_NAME => { + let node = node + .as_any() + .downcast_ref::() + .unwrap(); + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(CdcDebeziumUnrollExec::try_new( + input, + node.pk_indices.clone(), + )?))); + } + PACK_NODE_NAME => { + let input = physical_inputs[0].clone(); + return Ok(Some(Arc::new(CdcDebeziumPackExec::try_new(input)?))); + } + _ => return Ok(None), + } + } + let name = if let Some(key_extension) = node.as_any().downcast_ref::() { + key_extension.operator_label.clone() + } else { + None + }; + Ok(Some(Arc::new(FsMemExec::new( + name.unwrap_or("memory".to_string()), + Arc::new(schema), + )))) + } +} + +impl PlanToGraphVisitor<'_> { + fn add_index_to_traversal(&mut self, index: NodeIndex) { + if let Some(last) = self.traversal.last_mut() { + last.push(index); + } + } + + pub(crate) fn add_plan(&mut self, plan: LogicalPlan) -> Result<()> { + self.traversal.clear(); + plan.visit(self)?; + Ok(()) + } + + pub fn into_graph(self) -> LogicalGraph { + self.graph + } + + pub fn build_extension( + &mut self, + input_nodes: Vec, + extension: &dyn StreamingOperatorBlueprint, + ) -> Result<()> { + if let Some(node_name) = extension.operator_identity() + && self.named_nodes.contains_key(&node_name) + { + return plan_err!( + "extension {:?} has already been planned, shouldn't try again.", + node_name + ); + } + + let input_schemas = input_nodes + .iter() + .map(|index| { + Ok(self + .output_schemas + .get(index) + .ok_or_else(|| 
DataFusionError::Plan("missing input node".to_string()))? + .clone()) + }) + .collect::>>()?; + + let CompiledTopologyNode { + execution_unit, + routing_edges, + } = extension + .compile_to_graph_node(&self.planner, self.graph.node_count(), input_schemas) + .map_err(|e| e.context(format!("planning operator {extension:?}")))?; + + let node_index = self.graph.add_node(execution_unit); + self.add_index_to_traversal(node_index); + + for (source, edge) in input_nodes.into_iter().zip(routing_edges.into_iter()) { + self.graph.add_edge(source, node_index, edge); + } + + self.output_schemas + .insert(node_index, extension.yielded_schema().into()); + + if let Some(node_name) = extension.operator_identity() { + self.named_nodes.insert(node_name, node_index); + } + Ok(()) + } +} + +impl TreeNodeVisitor<'_> for PlanToGraphVisitor<'_> { + type Node = LogicalPlan; + + fn f_down(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + if stream_extension.is_passthrough_boundary() { + return Ok(TreeNodeRecursion::Continue); + } + + if let Some(name) = stream_extension.operator_identity() + && let Some(node_index) = self.named_nodes.get(&name) + { + self.add_index_to_traversal(*node_index); + return Ok(TreeNodeRecursion::Jump); + } + + if !node.inputs().is_empty() { + self.traversal.push(vec![]); + } + + Ok(TreeNodeRecursion::Continue) + } + + fn f_up(&mut self, node: &Self::Node) -> Result { + let LogicalPlan::Extension(Extension { node }) = node else { + return Ok(TreeNodeRecursion::Continue); + }; + + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("planning extension"))?; + + if stream_extension.is_passthrough_boundary() { + return Ok(TreeNodeRecursion::Continue); + } + 
+ if let Some(name) = stream_extension.operator_identity() + && self.named_nodes.contains_key(&name) + { + return Ok(TreeNodeRecursion::Continue); + } + + let input_nodes = if !node.inputs().is_empty() { + self.traversal.pop().unwrap_or_default() + } else { + vec![] + }; + let stream_extension: &dyn StreamingOperatorBlueprint = node + .try_into() + .map_err(|e: DataFusionError| e.context("converting extension"))?; + self.build_extension(input_nodes, stream_extension)?; + + Ok(TreeNodeRecursion::Continue) + } +} diff --git a/src/sql/mod.rs b/src/sql/mod.rs index ed3c2e30..71dd4dd1 100644 --- a/src/sql/mod.rs +++ b/src/sql/mod.rs @@ -10,6 +10,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod parser; +pub mod api; +pub mod common; -pub use parser::SqlParser; +pub mod analysis; +pub mod functions; +pub mod logical_node; +pub mod logical_planner; +pub mod parse; +pub mod physical; +pub mod schema; +pub mod types; + +pub use analysis::rewrite_plan; diff --git a/src/sql/parse.rs b/src/sql/parse.rs new file mode 100644 index 00000000..0c6b9541 --- /dev/null +++ b/src/sql/parse.rs @@ -0,0 +1,402 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Coordinator-facing SQL parsing (`parse_sql`). +//! +//! **Data-definition / pipeline shape (this entry point)** +//! Only these table-related forms are supported: +//! - **`CREATE TABLE ... 
(cols [, WATERMARK FOR ...]) WITH ('connector' = '...', 'format' = '...', ...)`** +//! connector-backed **source** DDL (no `AS SELECT`; `connector` in `WITH` selects this path) +//! - **`CREATE TABLE ...`** other forms (including `CREATE TABLE ... AS SELECT` where DataFusion accepts it) +//! - **`CREATE STREAMING TABLE ... WITH (...) AS SELECT ...`** (streaming sink DDL) +//! - **`DROP TABLE`** / **`DROP TABLE IF EXISTS`** / **`DROP STREAMING TABLE`** (alias for `DROP TABLE` on the stream catalog) +//! - **`SHOW TABLES`** — list stream catalog tables (connector sources and streaming sinks) +//! - **`SHOW CREATE TABLE `** — best-effort DDL text (full `WITH` / `AS SELECT` may not be stored) +//! +//! **`INSERT` is not supported** here — use `CREATE TABLE ... AS SELECT` or +//! `CREATE STREAMING TABLE ... AS SELECT` to define the query shape instead. +//! +//! Other supported statements include function lifecycle (`CREATE FUNCTION WITH`, `START FUNCTION`, …). + +use std::collections::HashMap; + +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; +use datafusion::sql::sqlparser::ast::{ + ObjectType, ShowCreateObject, SqlOption, Statement as DFStatement, +}; +use datafusion::sql::sqlparser::dialect::FunctionStreamDialect; +use datafusion::sql::sqlparser::parser::Parser; + +use crate::coordinator::{ + CreateFunction, CreateTable, DropFunction, DropStreamingTableStatement, DropTableStatement, + ShowCatalogTables, ShowCreateStreamingTable, ShowCreateTable, ShowFunctions, + ShowStreamingTables, StartFunction, Statement as CoordinatorStatement, StopFunction, + StreamingTableStatement, +}; + +/// Streaming-specific SQL that the sqlparser dialect does not natively handle. +/// +/// Returns `Some(statement)` if the SQL was intercepted, `None` otherwise so +/// the caller falls through to the normal sqlparser pipeline. 
+fn try_parse_streaming_statement(sql: &str) -> Option> { + let tokens: Vec<&str> = sql.split_whitespace().collect(); + if tokens.is_empty() { + return None; + } + + // SHOW STREAMING TABLES + if tokens.len() == 3 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("tables") + { + return Some(Box::new(ShowStreamingTables::new())); + } + + // SHOW CREATE STREAMING TABLE + if tokens.len() == 5 + && tokens[0].eq_ignore_ascii_case("show") + && tokens[1].eq_ignore_ascii_case("create") + && tokens[2].eq_ignore_ascii_case("streaming") + && tokens[3].eq_ignore_ascii_case("table") + { + let name = tokens[4].trim_end_matches(';').to_string(); + return Some(Box::new(ShowCreateStreamingTable::new(name))); + } + + // DROP STREAMING TABLE [IF EXISTS] + if tokens.len() >= 4 + && tokens[0].eq_ignore_ascii_case("drop") + && tokens[1].eq_ignore_ascii_case("streaming") + && tokens[2].eq_ignore_ascii_case("table") + { + let (if_exists, name_idx) = if tokens.len() >= 6 + && tokens[3].eq_ignore_ascii_case("if") + && tokens[4].eq_ignore_ascii_case("exists") + { + (true, 5) + } else { + (false, 3) + }; + + if name_idx >= tokens.len() { + return None; + } + let name = tokens[name_idx].trim_end_matches(';').to_string(); + return Some(Box::new(DropStreamingTableStatement::new(name, if_exists))); + } + + None +} + +pub fn parse_sql(query: &str) -> Result>> { + let trimmed = query.trim(); + if trimmed.is_empty() { + return plan_err!("Query is empty"); + } + + if let Some(stmt) = try_parse_streaming_statement(trimmed) { + return Ok(vec![stmt]); + } + + let dialect = FunctionStreamDialect {}; + let statements = Parser::parse_sql(&dialect, trimmed) + .map_err(|e| DataFusionError::Plan(format!("SQL parse error: {e}")))?; + + if statements.is_empty() { + return plan_err!("No SQL statements found"); + } + + statements.into_iter().map(classify_statement).collect() +} + +fn classify_statement(stmt: DFStatement) -> Result> { + 
match stmt { + DFStatement::CreateFunctionWith { options } => { + let properties = sql_options_to_map(&options); + let create_fn = CreateFunction::from_properties(properties) + .map_err(|e| DataFusionError::Plan(format!("CREATE FUNCTION: {e}")))?; + Ok(Box::new(create_fn)) + } + DFStatement::StartFunction { name } => Ok(Box::new(StartFunction::new(name.to_string()))), + DFStatement::StopFunction { name } => Ok(Box::new(StopFunction::new(name.to_string()))), + DFStatement::DropFunction { func_desc, .. } => { + let name = func_desc + .first() + .map(|d| d.name.to_string()) + .unwrap_or_default(); + Ok(Box::new(DropFunction::new(name))) + } + DFStatement::ShowFunctions { .. } => Ok(Box::new(ShowFunctions::new())), + DFStatement::ShowTables { .. } => Ok(Box::new(ShowCatalogTables::new())), + DFStatement::ShowCreate { obj_type, obj_name } => { + if obj_type != ShowCreateObject::Table { + return plan_err!( + "SHOW CREATE {obj_type} is not supported; use SHOW CREATE TABLE " + ); + } + Ok(Box::new(ShowCreateTable::new(obj_name.to_string()))) + } + s @ DFStatement::CreateTable(_) => Ok(Box::new(CreateTable::new(s))), + s @ DFStatement::CreateStreamingTable { .. } => { + Ok(Box::new(StreamingTableStatement::new(s))) + } + stmt @ DFStatement::Drop { .. } => { + { + let DFStatement::Drop { + object_type, names, .. + } = &stmt + else { + unreachable!() + }; + if *object_type != ObjectType::Table { + return plan_err!("Only DROP TABLE is supported in this SQL frontend"); + } + if names.len() != 1 { + return plan_err!("DROP TABLE supports exactly one table name per statement"); + } + } + Ok(Box::new(DropTableStatement::new(stmt))) + } + DFStatement::Insert { .. } => plan_err!( + "INSERT is not supported; only CREATE TABLE and CREATE STREAMING TABLE (with AS SELECT) \ + are supported for defining table/query pipelines in this SQL frontend" + ), + other => plan_err!("Unsupported SQL statement: {other}"), + } +} + +/// Convert Vec (KeyValue pairs) into HashMap. 
+fn sql_options_to_map(options: &[SqlOption]) -> HashMap { + options + .iter() + .filter_map(|opt| match opt { + SqlOption::KeyValue { key, value } => Some(( + key.value.clone(), + value.to_string().trim_matches('\'').to_string(), + )), + _ => None, + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn first_stmt(sql: &str) -> Box { + let mut stmts = parse_sql(sql).unwrap(); + assert!(!stmts.is_empty()); + stmts.remove(0) + } + + fn is_type(stmt: &dyn CoordinatorStatement, prefix: &str) -> bool { + format!("{:?}", stmt).starts_with(prefix) + } + + #[test] + fn test_parse_create_function() { + let sql = + "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_create_function_minimal() { + let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } + + #[test] + fn test_parse_drop_function() { + let stmt = first_stmt("DROP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "DropFunction")); + } + + #[test] + fn test_parse_start_function() { + let stmt = first_stmt("START FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StartFunction")); + } + + #[test] + fn test_parse_stop_function() { + let stmt = first_stmt("STOP FUNCTION my_task"); + assert!(is_type(stmt.as_ref(), "StopFunction")); + } + + #[test] + fn test_parse_show_functions() { + let stmt = first_stmt("SHOW FUNCTIONS"); + assert!(is_type(stmt.as_ref(), "ShowFunctions")); + } + + #[test] + fn test_parse_show_tables() { + let stmt = first_stmt("SHOW TABLES"); + assert!(is_type(stmt.as_ref(), "ShowCatalogTables")); + } + + #[test] + fn test_parse_show_create_table() { + let stmt = first_stmt("SHOW CREATE TABLE my_src"); + assert!(is_type(stmt.as_ref(), "ShowCreateTable")); + } + + #[test] + fn test_parse_create_table() { + let stmt = 
first_stmt("CREATE TABLE foo (id INT, name VARCHAR)"); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_create_table_connector_source_ddl() { + let sql = concat!( + "CREATE TABLE kafka_src (id BIGINT, ts TIMESTAMP NOT NULL, WATERMARK FOR ts) ", + "WITH ('connector' = 'kafka', 'format' = 'json', 'topic' = 'events')", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateTable")); + } + + #[test] + fn test_parse_drop_table() { + let stmt = first_stmt("DROP TABLE foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_table_if_exists() { + let stmt = first_stmt("DROP TABLE IF EXISTS foo"); + assert!(is_type(stmt.as_ref(), "DropTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table() { + let stmt = first_stmt("DROP STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_drop_streaming_table_if_exists() { + let stmt = first_stmt("DROP STREAMING TABLE IF EXISTS my_sink"); + assert!(is_type(stmt.as_ref(), "DropStreamingTableStatement")); + } + + #[test] + fn test_parse_show_streaming_tables() { + let stmt = first_stmt("SHOW STREAMING TABLES"); + assert!(is_type(stmt.as_ref(), "ShowStreamingTables")); + } + + #[test] + fn test_parse_show_create_streaming_table() { + let stmt = first_stmt("SHOW CREATE STREAMING TABLE my_sink"); + assert!(is_type(stmt.as_ref(), "ShowCreateStreamingTable")); + } + + /// `CREATE STREAMING TABLE` is the sink DDL supported by FunctionStream (not `CREATE STREAM TABLE`). 
+ #[test] + fn test_parse_create_streaming_table() { + let sql = concat!( + "CREATE STREAMING TABLE my_sink ", + "WITH ('connector' = 'kafka') ", + "AS SELECT id FROM src", + ); + let stmt = first_stmt(sql); + assert!( + is_type(stmt.as_ref(), "StreamingTableStatement"), + "expected StreamingTableStatement, got {:?}", + stmt + ); + } + + #[test] + fn test_parse_create_streaming_table_case_insensitive() { + let sql = concat!( + "create streaming table out_q ", + "with ('connector' = 'memory') ", + "as select 1 as x", + ); + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_case_insensitive() { + assert!(is_type( + first_stmt("create function with ('function_path'='./test.wasm')").as_ref(), + "CreateFunction" + )); + assert!(is_type( + first_stmt("show functions").as_ref(), + "ShowFunctions" + )); + assert!(is_type( + first_stmt("start function my_task").as_ref(), + "StartFunction" + )); + } + + #[test] + fn test_parse_multiple_statements() { + let sql = concat!( + "CREATE TABLE t1 (id INT); ", + "CREATE STREAMING TABLE sk WITH ('connector' = 'kafka') AS SELECT id FROM t1", + ); + let stmts = parse_sql(sql).unwrap(); + assert_eq!(stmts.len(), 2); + assert!(is_type(stmts[0].as_ref(), "CreateTable")); + assert!(is_type(stmts[1].as_ref(), "StreamingTableStatement")); + } + + #[test] + fn test_parse_empty() { + assert!(parse_sql("").is_err()); + assert!(parse_sql(" ").is_err()); + } + + #[test] + fn test_parse_unsupported_statement() { + let result = parse_sql("SELECT 1"); + assert!(result.is_err()); + } + + #[test] + fn test_insert_not_supported() { + let err = parse_sql("INSERT INTO sink SELECT * FROM src").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("INSERT") && msg.contains("not supported"), + "expected explicit INSERT rejection, got: {msg}" + ); + assert!( + msg.contains("CREATE TABLE") || msg.contains("CREATE STREAMING TABLE"), + "error should mention supported 
alternatives, got: {msg}" + ); + } + + #[test] + fn test_parse_with_extra_properties() { + let sql = r#"CREATE FUNCTION WITH ( + 'function_path'='./test.wasm', + 'config_path'='./config.yml', + 'parallelism'='4', + 'memory-limit'='256mb' + )"#; + let stmt = first_stmt(sql); + assert!(is_type(stmt.as_ref(), "CreateFunction")); + } +} diff --git a/src/sql/parser/sql_parser.rs b/src/sql/parser/sql_parser.rs deleted file mode 100644 index dc110745..00000000 --- a/src/sql/parser/sql_parser.rs +++ /dev/null @@ -1,249 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use pest::Parser; -use pest_derive::Parser; - -use super::ParseError; -use crate::coordinator::{ - CreateFunction, DropFunction, ShowFunctions, StartFunction, Statement, StopFunction, -}; -use std::collections::HashMap; - -#[derive(Parser)] -#[grammar = "src/sql/grammar.pest"] -struct Grammar; - -#[derive(Debug, Default)] -pub struct SqlParser; - -impl SqlParser { - pub fn parse(sql: &str) -> Result, ParseError> { - let pairs = Grammar::parse(Rule::statement, sql) - .map_err(|e| ParseError::new(format!("Parse error: {}", e)))?; - - for pair in pairs { - return match pair.as_rule() { - Rule::create_stmt => { - handle_create_stmt(pair).map(|stmt| stmt as Box) - } - Rule::drop_stmt => handle_drop_stmt(pair).map(|stmt| stmt as Box), - Rule::start_stmt => handle_start_stmt(pair).map(|stmt| stmt as Box), - Rule::stop_stmt => handle_stop_stmt(pair).map(|stmt| stmt as Box), - Rule::show_stmt => handle_show_stmt(pair).map(|stmt| stmt as Box), - _ => continue, - }; - } - - Err(ParseError::new("Unknown statement type")) - } -} - -fn handle_create_stmt( - pair: pest::iterators::Pair, -) -> Result, ParseError> { - let mut inner = pair.into_inner(); - // Note: name is read from config file, not from SQL statement - // Pass empty string here, name will be read from config file later - let properties = inner - .next() - .map(parse_properties) - .ok_or_else(|| ParseError::new("Missing WITH clause"))?; - - Ok(Box::new( - CreateFunction::from_properties(properties).map_err(ParseError::from)?, - )) -} - -fn handle_drop_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(DropFunction::new(name))) -} - -fn handle_start_stmt(pair: pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StartFunction::new(name))) -} - -fn handle_stop_stmt(pair: 
pest::iterators::Pair) -> Result, ParseError> { - let mut inner = pair.into_inner(); - let name = inner.next().map(extract_string).unwrap_or_default(); - Ok(Box::new(StopFunction::new(name))) -} - -fn handle_show_stmt(_pair: pest::iterators::Pair) -> Result, ParseError> { - Ok(Box::new(ShowFunctions::new())) -} - -fn extract_string(pair: pest::iterators::Pair) -> String { - match pair.as_rule() { - Rule::string_literal => { - let s = pair.as_str(); - if (s.starts_with('\'') && s.ends_with('\'')) - || (s.starts_with('"') && s.ends_with('"')) - { - unescape_string(&s[1..s.len() - 1]) - } else { - unescape_string(s) - } - } - Rule::identifier => pair.as_str().to_string(), - _ => pair.as_str().to_string(), - } -} - -fn unescape_string(s: &str) -> String { - let mut result = String::with_capacity(s.len()); - let mut chars = s.chars().peekable(); - - while let Some(ch) = chars.next() { - if ch == '\\' { - if let Some(&next) = chars.peek() { - chars.next(); - match next { - 'n' => result.push('\n'), - 't' => result.push('\t'), - 'r' => result.push('\r'), - '\\' => result.push('\\'), - '\'' => result.push('\''), - '"' => result.push('"'), - _ => { - result.push('\\'); - result.push(next); - } - } - } else { - result.push(ch); - } - } else { - result.push(ch); - } - } - - result -} - -fn parse_properties(pair: pest::iterators::Pair) -> HashMap { - let mut properties = HashMap::new(); - - for prop in pair.into_inner() { - if prop.as_rule() == Rule::property { - let mut inner = prop.into_inner(); - if let (Some(key_pair), Some(val_pair)) = (inner.next(), inner.next()) { - let key = key_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - let value = val_pair - .into_inner() - .next() - .map(extract_string) - .unwrap_or_default(); - properties.insert(key, value); - } - } - } - - properties -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_create_function() { - let sql = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 
'config_path'='./config.yml')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_create_function_minimal() { - let sql = "CREATE FUNCTION WITH ('function_path'='./processor.wasm')"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - // Note: SQL only supports Path mode, not Bytes mode - // Bytes mode is only for gRPC requests - - #[test] - fn test_drop_function() { - let sql = "DROP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_start_function() { - let sql = "START FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_stop_function() { - let sql = "STOP FUNCTION my_task"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_show_functions() { - let sql = "SHOW FUNCTIONS"; - let _stmt = SqlParser::parse(sql).unwrap(); - } - - #[test] - fn test_case_insensitive_keywords() { - let sql1 = "create function with ('function_path'='./test.wasm')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = "Create Function With ('Function_Path'='./test.wasm')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = "show functions"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - let sql4 = "start function my_task"; - let _stmt4 = SqlParser::parse(sql4).unwrap(); - } - - #[test] - fn test_case_insensitive_property_keys() { - let sql1 = - "CREATE FUNCTION WITH ('function_path'='./test.wasm', 'config_path'='./config.yml')"; - let _stmt1 = SqlParser::parse(sql1).unwrap(); - - let sql2 = - "CREATE FUNCTION WITH ('Function_Path'='./test.wasm', 'Config_Path'='./config.yml')"; - let _stmt2 = SqlParser::parse(sql2).unwrap(); - - let sql3 = - "CREATE FUNCTION WITH ('FUNCTION_PATH'='./test.wasm', 'CONFIG_PATH'='./config.yml')"; - let _stmt3 = SqlParser::parse(sql3).unwrap(); - - // Note: SQL only supports Path mode (function_path, config_path) - // Bytes mode (function, config) is only for gRPC requests - } - - #[test] - fn 
test_with_extra_properties() { - let sql = r#"CREATE FUNCTION WITH ( - 'function_path'='./test.wasm', - 'config_path'='./config.yml', - 'parallelism'='4', - 'memory-limit'='256mb' - )"#; - let _stmt = SqlParser::parse(sql).unwrap(); - } -} diff --git a/src/sql/physical/cdc/encode.rs b/src/sql/physical/cdc/encode.rs new file mode 100644 index 00000000..65ec758d --- /dev/null +++ b/src/sql/physical/cdc/encode.rs @@ -0,0 +1,342 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + Array, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, RecordBatch, StringArray, + StructArray, TimestampNanosecondBuilder, UInt32Array, UInt32Builder, +}; +use datafusion::arrow::compute::take; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimestampNanosecondType}; +use datafusion::common::{DataFusionError, Result}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{StreamExt, ready, stream::Stream}; + +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::common::{TIMESTAMP_FIELD, UPDATING_META_FIELD}; +use crate::sql::physical::source_exec::make_stream_properties; + +// 
============================================================================ +// CdcDebeziumPackExec (Execution Plan Node) +// ============================================================================ + +/// Packs internal flat changelog rows into Debezium-style `before` / `after` / `op` / timestamp. +/// +/// Intended as the last physical node before a sink that expects Debezium CDC envelopes. +#[derive(Debug)] +pub struct CdcDebeziumPackExec { + input: Arc, + schema: SchemaRef, + properties: PlanProperties, +} + +impl CdcDebeziumPackExec { + pub fn try_new(input: Arc) -> Result { + let input_schema = input.schema(); + let timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + + let struct_fields: Vec<_> = input_schema + .fields() + .iter() + .enumerate() + .filter_map(|(index, field)| { + if field.name() == UPDATING_META_FIELD || index == timestamp_index { + None + } else { + Some(field.clone()) + } + }) + .collect(); + + let payload_struct_type = DataType::Struct(struct_fields.into()); + + let before_field = Arc::new(Field::new(cdc::BEFORE, payload_struct_type.clone(), true)); + let after_field = Arc::new(Field::new(cdc::AFTER, payload_struct_type, true)); + let op_field = Arc::new(Field::new(cdc::OP, DataType::Utf8, false)); + let timestamp_field = Arc::new(input_schema.field(timestamp_index).clone()); + + let output_schema = Arc::new(Schema::new(vec![ + before_field, + after_field, + op_field, + timestamp_field, + ])); + + Ok(Self { + input, + schema: output_schema.clone(), + properties: make_stream_properties(output_schema), + }) + } + + pub(crate) fn from_decoded_parts(input: Arc, schema: SchemaRef) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + } + } +} + +impl DisplayAs for CdcDebeziumPackExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "CdcDebeziumPackExec") + } +} + +impl ExecutionPlan for 
CdcDebeziumPackExec { + fn name(&self) -> &str { + physical_plan_node_name::TO_DEBEZIUM_EXEC + } + + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "CdcDebeziumPackExec expects exactly 1 child".into(), + )); + } + Ok(Arc::new(Self::try_new(children[0].clone())?)) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let updating_meta_index = self.input.schema().index_of(UPDATING_META_FIELD).ok(); + let timestamp_index = self.input.schema().index_of(TIMESTAMP_FIELD)?; + + let struct_projection = (0..self.input.schema().fields().len()) + .filter(|index| (updating_meta_index != Some(*index)) && *index != timestamp_index) + .collect(); + + Ok(Box::pin(CdcDebeziumPackStream { + input: self.input.execute(partition, context)?, + schema: self.schema.clone(), + updating_meta_index, + timestamp_index, + struct_projection, + })) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +// ============================================================================ +// CdcDebeziumPackStream (Physical Stream Execution) +// ============================================================================ + +struct CdcDebeziumPackStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + updating_meta_index: Option, + timestamp_index: usize, + struct_projection: Vec, +} + +#[derive(Debug)] +struct RowCompactionState { + first_idx: usize, + last_idx: usize, + first_is_create: bool, + last_is_create: bool, + max_timestamp: i64, +} + +impl CdcDebeziumPackStream { + fn compact_changelog<'a>( + num_rows: usize, + is_retract: &'a BooleanArray, + id_array: &'a FixedSizeBinaryArray, + timestamps: &'a PrimitiveArray, + ) -> 
(Vec<&'a [u8]>, HashMap<&'a [u8], RowCompactionState>) { + let mut state_map: HashMap<&[u8], RowCompactionState> = HashMap::new(); + let mut unique_order = vec![]; + + for i in 0..num_rows { + let row_id = id_array.value(i); + let is_create = !is_retract.value(i); + let timestamp = timestamps.value(i); + + state_map + .entry(row_id) + .and_modify(|state| { + state.last_idx = i; + state.last_is_create = is_create; + state.max_timestamp = state.max_timestamp.max(timestamp); + }) + .or_insert_with(|| { + unique_order.push(row_id); + RowCompactionState { + first_idx: i, + last_idx: i, + first_is_create: is_create, + last_is_create: is_create, + max_timestamp: timestamp, + } + }); + } + (unique_order, state_map) + } + + fn as_debezium_batch(&mut self, batch: &RecordBatch) -> Result { + let value_struct = batch.project(&self.struct_projection)?; + let timestamps = batch + .column(self.timestamp_index) + .as_primitive::(); + + let columns: Vec> = if let Some(meta_index) = self.updating_meta_index { + let metadata = batch.column(meta_index).as_struct(); + let is_retract = metadata.column(0).as_boolean(); + let row_ids = metadata.column(1).as_fixed_size_binary(); + + let (ordered_ids, state_map) = + Self::compact_changelog(batch.num_rows(), is_retract, row_ids, timestamps); + + let mut before_builder = UInt32Builder::with_capacity(state_map.len()); + let mut after_builder = UInt32Builder::with_capacity(state_map.len()); + let mut op_vec = Vec::with_capacity(state_map.len()); + let mut ts_builder = TimestampNanosecondBuilder::with_capacity(state_map.len()); + + for row_id in ordered_ids { + let state = state_map + .get(row_id) + .expect("row id from order must exist in map"); + + match (state.first_is_create, state.last_is_create) { + (true, true) => { + before_builder.append_null(); + after_builder.append_value(state.last_idx as u32); + op_vec.push(debezium_op_short::CREATE); + } + (false, false) => { + before_builder.append_value(state.first_idx as u32); + 
after_builder.append_null(); + op_vec.push(debezium_op_short::DELETE); + } + (false, true) => { + before_builder.append_value(state.first_idx as u32); + after_builder.append_value(state.last_idx as u32); + op_vec.push(debezium_op_short::UPDATE); + } + (true, false) => { + continue; + } + } + ts_builder.append_value(state.max_timestamp); + } + + let before_indices = before_builder.finish(); + let after_indices = after_builder.finish(); + + let before_array = Self::take_struct_columns(&value_struct, &before_indices)?; + let after_array = Self::take_struct_columns(&value_struct, &after_indices)?; + let op_array = StringArray::from(op_vec); + + vec![ + Arc::new(before_array), + Arc::new(after_array), + Arc::new(op_array), + Arc::new(ts_builder.finish()), + ] + } else { + let num_rows = value_struct.num_rows(); + + let after_array = StructArray::try_new( + value_struct.schema().fields().clone(), + value_struct.columns().to_vec(), + None, + )?; + let before_array = + StructArray::new_null(value_struct.schema().fields().clone(), num_rows); + + let op_array = StringArray::from_iter_values(std::iter::repeat_n( + debezium_op_short::CREATE, + num_rows, + )); + + vec![ + Arc::new(before_array), + Arc::new(after_array), + Arc::new(op_array), + batch.column(self.timestamp_index).clone(), + ] + }; + + Ok(RecordBatch::try_new(self.schema.clone(), columns)?) + } + + fn take_struct_columns( + value_struct: &RecordBatch, + indices: &UInt32Array, + ) -> Result { + let mut arrays: Vec> = Vec::with_capacity(value_struct.num_columns()); + + for col in value_struct.columns() { + arrays.push(take(col.as_ref(), indices, None)?); + } + + Ok(StructArray::try_new( + value_struct.schema().fields().clone(), + arrays, + indices.nulls().cloned(), + )?) 
+ } +} + +impl Stream for CdcDebeziumPackStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.as_mut().get_mut(); + match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => Poll::Ready(Some(this.as_debezium_batch(&batch))), + Some(Err(e)) => Poll::Ready(Some(Err(e))), + None => Poll::Ready(None), + } + } +} + +impl RecordBatchStream for CdcDebeziumPackStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/cdc/mod.rs b/src/sql/physical/cdc/mod.rs new file mode 100644 index 00000000..216dd4c1 --- /dev/null +++ b/src/sql/physical/cdc/mod.rs @@ -0,0 +1,17 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod encode; +mod unroll; + +pub use encode::CdcDebeziumPackExec; +pub use unroll::CdcDebeziumUnrollExec; diff --git a/src/sql/physical/cdc/unroll.rs b/src/sql/physical/cdc/unroll.rs new file mode 100644 index 00000000..10c62c6c --- /dev/null +++ b/src/sql/physical/cdc/unroll.rs @@ -0,0 +1,322 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::AsArray; +use datafusion::arrow::array::{ + BooleanBuilder, RecordBatch, StructArray, TimestampNanosecondBuilder, UInt32Builder, +}; +use datafusion::arrow::compute::{concat, take}; +use datafusion::arrow::datatypes::{ + DataType, Field, Schema, SchemaRef, TimeUnit, TimestampNanosecondType, +}; +use datafusion::common::{DataFusionError, Result, plan_err}; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; +use datafusion::logical_expr::ColumnarValue; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}; +use futures::{StreamExt, ready, stream::Stream}; + +use crate::sql::common::TIMESTAMP_FIELD; +use crate::sql::common::constants::{cdc, debezium_op_short, physical_plan_node_name}; +use crate::sql::functions::MultiHashFunction; +use crate::sql::physical::meta::{updating_meta_field, updating_meta_fields}; +use crate::sql::physical::source_exec::make_stream_properties; + +// ============================================================================ +// CdcDebeziumUnrollExec (Execution Plan Node) +// ============================================================================ + +/// Physical node that unrolls Debezium CDC payloads (`before` / `after` / `op`) into a flat +/// changelog stream with retract metadata. 
+/// +/// - `c` / `r` → emit `after` (`is_retract = false`) +/// - `d` → emit `before` (`is_retract = true`) +/// - `u` → emit `before` (retract) then `after` (insert) +#[derive(Debug)] +pub struct CdcDebeziumUnrollExec { + input: Arc, + schema: SchemaRef, + properties: PlanProperties, + primary_key_indices: Vec, +} + +impl CdcDebeziumUnrollExec { + /// Builds the node and validates Debezium payload schema constraints. + pub fn try_new(input: Arc, primary_key_indices: Vec) -> Result { + let input_schema = input.schema(); + + let before_index = input_schema.index_of(cdc::BEFORE)?; + let after_index = input_schema.index_of(cdc::AFTER)?; + let op_index = input_schema.index_of(cdc::OP)?; + let _timestamp_index = input_schema.index_of(TIMESTAMP_FIELD)?; + + let before_type = input_schema.field(before_index).data_type(); + let after_type = input_schema.field(after_index).data_type(); + + if before_type != after_type { + return Err(DataFusionError::Plan( + "CDC 'before' and 'after' columns must share the exact same DataType".to_string(), + )); + } + + if *input_schema.field(op_index).data_type() != DataType::Utf8 { + return Err(DataFusionError::Plan( + "CDC 'op' (operation) column must be of type Utf8 (String)".to_string(), + )); + } + + let DataType::Struct(fields) = before_type else { + return Err(DataFusionError::Plan( + "CDC 'before' and 'after' payload columns must be Structs".to_string(), + )); + }; + + let mut unrolled_fields = fields.to_vec(); + unrolled_fields.push(updating_meta_field()); + unrolled_fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + + let schema = Arc::new(Schema::new(unrolled_fields)); + + Ok(Self { + input, + schema: schema.clone(), + properties: make_stream_properties(schema), + primary_key_indices, + }) + } + + /// Used when deserializing a plan with a pre-baked output schema (see [`StreamingExtensionCodec`]). 
+ pub(crate) fn from_decoded_parts( + input: Arc, + schema: SchemaRef, + primary_key_indices: Vec, + ) -> Self { + Self { + properties: make_stream_properties(schema.clone()), + input, + schema, + primary_key_indices, + } + } + + pub fn primary_key_indices(&self) -> &[usize] { + &self.primary_key_indices + } +} + +impl DisplayAs for CdcDebeziumUnrollExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "CdcDebeziumUnrollExec") + } +} + +impl ExecutionPlan for CdcDebeziumUnrollExec { + fn name(&self) -> &str { + physical_plan_node_name::DEBEZIUM_UNROLLING_EXEC + } + + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "CdcDebeziumUnrollExec expects exactly one child".to_string(), + )); + } + Ok(Arc::new(Self { + input: children[0].clone(), + schema: self.schema.clone(), + properties: self.properties.clone(), + primary_key_indices: self.primary_key_indices.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + Ok(Box::pin(CdcDebeziumUnrollStream::try_new( + self.input.execute(partition, context)?, + self.schema.clone(), + self.primary_key_indices.clone(), + )?)) + } + + fn reset(&self) -> Result<()> { + self.input.reset() + } +} + +// ============================================================================ +// CdcDebeziumUnrollStream (Physical Stream Execution) +// ============================================================================ + +struct CdcDebeziumUnrollStream { + input: SendableRecordBatchStream, + schema: SchemaRef, + before_index: usize, + after_index: usize, + op_index: usize, + 
timestamp_index: usize, + primary_key_indices: Vec, +} + +impl CdcDebeziumUnrollStream { + fn try_new( + input: SendableRecordBatchStream, + schema: SchemaRef, + primary_key_indices: Vec, + ) -> Result { + if primary_key_indices.is_empty() { + return plan_err!( + "A CDC source requires at least one primary key to maintain state correctly." + ); + } + + let input_schema = input.schema(); + Ok(Self { + input, + schema, + before_index: input_schema.index_of(cdc::BEFORE)?, + after_index: input_schema.index_of(cdc::AFTER)?, + op_index: input_schema.index_of(cdc::OP)?, + timestamp_index: input_schema.index_of(TIMESTAMP_FIELD)?, + primary_key_indices, + }) + } + + fn unroll_batch(&self, batch: &RecordBatch) -> Result { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(RecordBatch::new_empty(self.schema.clone())); + } + + let before_col = batch.column(self.before_index); + let after_col = batch.column(self.after_index); + + let op_array = batch.column(self.op_index).as_string::(); + let timestamp_array = batch + .column(self.timestamp_index) + .as_primitive::(); + + let max_capacity = num_rows * 2; + let mut take_indices = UInt32Builder::with_capacity(max_capacity); + let mut is_retract_builder = BooleanBuilder::with_capacity(max_capacity); + let mut timestamp_builder = TimestampNanosecondBuilder::with_capacity(max_capacity); + + for i in 0..num_rows { + let op = op_array.value(i); + let ts = timestamp_array.value(i); + + match op { + debezium_op_short::CREATE | debezium_op_short::READ => { + take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(ts); + } + debezium_op_short::DELETE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(ts); + } + debezium_op_short::UPDATE => { + take_indices.append_value(i as u32); + is_retract_builder.append_value(true); + timestamp_builder.append_value(ts); + + 
take_indices.append_value((i + num_rows) as u32); + is_retract_builder.append_value(false); + timestamp_builder.append_value(ts); + } + _ => { + return Err(DataFusionError::Execution(format!( + "Encountered unexpected Debezium operation code: '{op}'" + ))); + } + } + } + + let take_indices = take_indices.finish(); + let unrolled_row_count = take_indices.len(); + + let combined_array = concat(&[before_col.as_ref(), after_col.as_ref()])?; + let unrolled_array = take(&combined_array, &take_indices, None)?; + + let mut final_columns = unrolled_array.as_struct().columns().to_vec(); + + let pk_columns: Vec = self + .primary_key_indices + .iter() + .map(|&idx| ColumnarValue::Array(Arc::clone(&final_columns[idx]))) + .collect(); + + let hash_column = MultiHashFunction::default().invoke(&pk_columns)?; + let ids_array = hash_column.into_array(unrolled_row_count)?; + + let meta_struct = StructArray::try_new( + updating_meta_fields(), + vec![Arc::new(is_retract_builder.finish()), ids_array], + None, + )?; + + final_columns.push(Arc::new(meta_struct)); + final_columns.push(Arc::new(timestamp_builder.finish())); + + Ok(RecordBatch::try_new(self.schema.clone(), final_columns)?) 
+ } +} + +impl Stream for CdcDebeziumUnrollStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.as_mut().get_mut(); + match ready!(this.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => Poll::Ready(Some(this.unroll_batch(&batch))), + Some(Err(e)) => Poll::Ready(Some(Err(e))), + None => Poll::Ready(None), + } + } +} + +impl RecordBatchStream for CdcDebeziumUnrollStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} diff --git a/src/sql/physical/codec.rs b/src/sql/physical/codec.rs new file mode 100644 index 00000000..1b96a9d6 --- /dev/null +++ b/src/sql/physical/codec.rs @@ -0,0 +1,307 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::Schema; +use datafusion::common::{DataFusionError, Result, UnnestOptions, not_impl_err}; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::ScalarUDF; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::unnest::{ListUnnest, UnnestExec}; +use datafusion_proto::physical_plan::PhysicalExtensionCodec; +use prost::Message; +use protocol::function_stream_graph::{ + DebeziumDecodeNode, DebeziumEncodeNode, FsExecNode, MemExecNode, UnnestExecNode, + fs_exec_node::Node, +}; +use tokio::sync::mpsc::UnboundedReceiver; + +use crate::sql::analysis::UNNESTED_COL; +use crate::sql::common::constants::{mem_exec_join_side, window_function_udf}; +use crate::sql::physical::cdc::{CdcDebeziumPackExec, CdcDebeziumUnrollExec}; +use crate::sql::physical::source_exec::{ + BufferedBatchesExec, InjectableSingleBatchExec, MpscReceiverStreamExec, PlanningPlaceholderExec, +}; +use crate::sql::physical::udfs::window; + +// ============================================================================ +// StreamingExtensionCodec & StreamingDecodingContext +// ============================================================================ + +/// Worker-side context used when deserializing a physical plan from the coordinator. +/// +/// Planning uses [`PlanningPlaceholderExec`]; at runtime this selects the real source +/// implementation (locked batch, MPSC stream, join sides, etc.). +#[derive(Debug)] +pub enum StreamingDecodingContext { + None, + Planning, + SingleLockedBatch(Arc>>), + UnboundedBatchStream(Arc>>>), + LockedBatchVec(Arc>>), + LockedJoinPair { + left: Arc>>, + right: Arc>>, + }, + LockedJoinStream { + left: Arc>>>, + right: Arc>>>, + }, +} + +/// Codec for custom streaming physical extension nodes (`FsExecNode` protobuf). 
+#[derive(Debug)] +pub struct StreamingExtensionCodec { + pub context: StreamingDecodingContext, +} + +impl Default for StreamingExtensionCodec { + fn default() -> Self { + Self { + context: StreamingDecodingContext::None, + } + } +} + +impl PhysicalExtensionCodec for StreamingExtensionCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + _registry: &dyn FunctionRegistry, + ) -> Result> { + let exec: FsExecNode = Message::decode(buf).map_err(|err| { + DataFusionError::Internal(format!("Failed to deserialize FsExecNode protobuf: {err}")) + })?; + + let node = exec.node.ok_or_else(|| { + DataFusionError::Internal("Decoded FsExecNode contains no inner node data".to_string()) + })?; + + match node { + Node::MemExec(mem) => self.decode_placeholder_exec(mem), + Node::UnnestExec(unnest) => decode_unnest_exec(unnest, inputs), + Node::DebeziumDecode(debezium) => decode_debezium_unroll(debezium, inputs), + Node::DebeziumEncode(debezium) => decode_debezium_pack(debezium, inputs), + } + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + let mut proto = None; + + if let Some(table) = node.as_any().downcast_ref::() { + let schema_json = serde_json::to_string(&table.schema).map_err(|e| { + DataFusionError::Internal(format!("Failed to serialize schema to JSON: {e}")) + })?; + + proto = Some(FsExecNode { + node: Some(Node::MemExec(MemExecNode { + table_name: table.table_name.clone(), + schema: schema_json, + })), + }); + } else if let Some(unnest) = node.as_any().downcast_ref::() { + let schema_json = serde_json::to_string(&unnest.schema()).map_err(|e| { + DataFusionError::Internal(format!("Failed to serialize unnest schema to JSON: {e}")) + })?; + + proto = Some(FsExecNode { + node: Some(Node::UnnestExec(UnnestExecNode { + schema: schema_json, + })), + }); + } else if let Some(decode) = node.as_any().downcast_ref::() { + let schema_json = serde_json::to_string(decode.schema().as_ref()).map_err(|e| { + DataFusionError::Internal(format!("Failed to 
serialize CDC unroll schema: {e}")) + })?; + + proto = Some(FsExecNode { + node: Some(Node::DebeziumDecode(DebeziumDecodeNode { + schema: schema_json, + primary_keys: decode + .primary_key_indices() + .iter() + .map(|&c| c as u64) + .collect(), + })), + }); + } else if let Some(encode) = node.as_any().downcast_ref::() { + let schema_json = serde_json::to_string(encode.schema().as_ref()).map_err(|e| { + DataFusionError::Internal(format!("Failed to serialize CDC pack schema: {e}")) + })?; + + proto = Some(FsExecNode { + node: Some(Node::DebeziumEncode(DebeziumEncodeNode { + schema: schema_json, + })), + }); + } + + if let Some(proto_node) = proto { + proto_node.encode(buf).map_err(|err| { + DataFusionError::Internal(format!("Failed to encode protobuf node: {err}")) + })?; + Ok(()) + } else { + Err(DataFusionError::Internal(format!( + "Cannot serialize unknown physical plan node: {node:?}" + ))) + } + } + + fn try_decode_udf(&self, name: &str, _buf: &[u8]) -> Result> { + if name == window_function_udf::NAME { + return Ok(window()); + } + not_impl_err!("PhysicalExtensionCodec does not support scalar function '{name}'") + } +} + +impl StreamingExtensionCodec { + fn decode_placeholder_exec(&self, mem_exec: MemExecNode) -> Result> { + let schema: Schema = serde_json::from_str(&mem_exec.schema).map_err(|e| { + DataFusionError::Internal(format!("Invalid schema JSON in exec codec: {e:?}")) + })?; + let schema = Arc::new(schema); + + match &self.context { + StreamingDecodingContext::SingleLockedBatch(single_batch) => Ok(Arc::new( + InjectableSingleBatchExec::new(schema, single_batch.clone()), + )), + StreamingDecodingContext::UnboundedBatchStream(unbounded_stream) => Ok(Arc::new( + MpscReceiverStreamExec::new(schema, unbounded_stream.clone()), + )), + StreamingDecodingContext::LockedBatchVec(locked_batches) => Ok(Arc::new( + BufferedBatchesExec::new(schema, locked_batches.clone()), + )), + StreamingDecodingContext::Planning => Ok(Arc::new(PlanningPlaceholderExec::new( + 
mem_exec.table_name, + schema, + ))), + StreamingDecodingContext::None => Err(DataFusionError::Internal( + "A valid StreamingDecodingContext is required to decode placeholders into execution streams.".into(), + )), + StreamingDecodingContext::LockedJoinPair { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => Ok(Arc::new(InjectableSingleBatchExec::new( + schema, + left.clone(), + ))), + mem_exec_join_side::RIGHT => Ok(Arc::new(InjectableSingleBatchExec::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "Unknown join side table name: {}", + mem_exec.table_name + ))), + } + } + StreamingDecodingContext::LockedJoinStream { left, right } => { + match mem_exec.table_name.as_str() { + mem_exec_join_side::LEFT => Ok(Arc::new(MpscReceiverStreamExec::new( + schema, + left.clone(), + ))), + mem_exec_join_side::RIGHT => Ok(Arc::new(MpscReceiverStreamExec::new( + schema, + right.clone(), + ))), + _ => Err(DataFusionError::Internal(format!( + "Unknown join side table name: {}", + mem_exec.table_name + ))), + } + } + } + } +} + +fn decode_unnest_exec( + unnest: UnnestExecNode, + inputs: &[Arc], +) -> Result> { + let schema: Schema = serde_json::from_str(&unnest.schema) + .map_err(|e| DataFusionError::Internal(format!("Invalid unnest schema JSON: {e:?}")))?; + + let column = schema.index_of(UNNESTED_COL).map_err(|_| { + DataFusionError::Internal(format!( + "Unnest schema missing required column: {UNNESTED_COL}" + )) + })?; + + let input = inputs.first().ok_or_else(|| { + DataFusionError::Internal("UnnestExec requires exactly one input plan".to_string()) + })?; + + Ok(Arc::new(UnnestExec::new( + input.clone(), + vec![ListUnnest { + index_in_input_schema: column, + depth: 1, + }], + vec![], + Arc::new(schema), + UnnestOptions::default(), + ))) +} + +fn decode_debezium_unroll( + debezium: DebeziumDecodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new( + 
serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("Invalid DebeziumDecode schema JSON: {e:?}")) + })?, + ); + + let input = inputs.first().ok_or_else(|| { + DataFusionError::Internal( + "CdcDebeziumUnrollExec requires exactly one input plan".to_string(), + ) + })?; + + let primary_keys = debezium + .primary_keys + .into_iter() + .map(|c| c as usize) + .collect(); + + Ok(Arc::new(CdcDebeziumUnrollExec::from_decoded_parts( + input.clone(), + schema, + primary_keys, + ))) +} + +fn decode_debezium_pack( + debezium: DebeziumEncodeNode, + inputs: &[Arc], +) -> Result> { + let schema = Arc::new( + serde_json::from_str::(&debezium.schema).map_err(|e| { + DataFusionError::Internal(format!("Invalid DebeziumEncode schema JSON: {e:?}")) + })?, + ); + + let input = inputs.first().ok_or_else(|| { + DataFusionError::Internal("CdcDebeziumPackExec requires exactly one input plan".to_string()) + })?; + + Ok(Arc::new(CdcDebeziumPackExec::from_decoded_parts( + input.clone(), + schema, + ))) +} diff --git a/src/sql/physical/meta.rs b/src/sql/physical/meta.rs new file mode 100644 index 00000000..1387482c --- /dev/null +++ b/src/sql/physical/meta.rs @@ -0,0 +1,47 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::{Arc, OnceLock}; + +use datafusion::arrow::datatypes::{DataType, Field, Fields}; + +use crate::sql::common::UPDATING_META_FIELD; +use crate::sql::common::constants::updating_state_field; + +pub fn updating_meta_fields() -> Fields { + static FIELDS: OnceLock = OnceLock::new(); + FIELDS + .get_or_init(|| { + Fields::from(vec![ + Field::new(updating_state_field::IS_RETRACT, DataType::Boolean, true), + Field::new( + updating_state_field::ID, + DataType::FixedSizeBinary(16), + true, + ), + ]) + }) + .clone() +} + +pub fn updating_meta_field() -> Arc { + static FIELD: OnceLock> = OnceLock::new(); + FIELD + .get_or_init(|| { + Arc::new(Field::new( + UPDATING_META_FIELD, + DataType::Struct(updating_meta_fields()), + false, + )) + }) + .clone() +} diff --git a/src/sql/physical/mod.rs b/src/sql/physical/mod.rs new file mode 100644 index 00000000..77f5c107 --- /dev/null +++ b/src/sql/physical/mod.rs @@ -0,0 +1,23 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod cdc; +mod codec; +mod meta; +mod source_exec; +mod udfs; + +pub use cdc::{CdcDebeziumPackExec, CdcDebeziumUnrollExec}; +pub use codec::{StreamingDecodingContext, StreamingExtensionCodec}; +pub use meta::{updating_meta_field, updating_meta_fields}; +pub use source_exec::FsMemExec; +pub use udfs::window; diff --git a/src/sql/physical/source_exec.rs b/src/sql/physical/source_exec.rs new file mode 100644 index 00000000..fa65cbfd --- /dev/null +++ b/src/sql/physical/source_exec.rs @@ -0,0 +1,400 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::mem; +use std::sync::Arc; + +use datafusion::arrow::array::RecordBatch; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::catalog::memory::MemorySourceConfig; +use datafusion::common::{DataFusionError, Result, Statistics, not_impl_err, plan_err}; +use datafusion::datasource::memory::DataSourceExec; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::memory::MemoryStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties}; +use futures::StreamExt; +use tokio::sync::mpsc::UnboundedReceiver; +use tokio_stream::wrappers::UnboundedReceiverStream; + +use crate::sql::common::constants::physical_plan_node_name; + +/// Standard [`PlanProperties`] for a continuous, unbounded stream: incremental emission, +/// unknown partitioning, and unbounded boundedness (without requiring infinite memory). +pub(crate) fn create_unbounded_stream_properties(schema: SchemaRef) -> PlanProperties { + PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Unbounded { + requires_infinite_memory: false, + }, + ) +} + +/// Alias for call sites that still use the older name. +pub(crate) fn make_stream_properties(schema: SchemaRef) -> PlanProperties { + create_unbounded_stream_properties(schema) +} + +// ============================================================================ +// InjectableSingleBatchExec (formerly RwLockRecordBatchReader) +// ============================================================================ + +/// Yields exactly one [`RecordBatch`], injected via a lock before `execute` runs. 
+/// +/// For event-driven loops that receive a single batch from the network and run a DataFusion +/// plan over it, the batch is stored in the lock until execution starts. +#[derive(Debug)] +pub(crate) struct InjectableSingleBatchExec { + schema: SchemaRef, + injected_batch: Arc>>, + properties: PlanProperties, +} + +impl InjectableSingleBatchExec { + pub(crate) fn new( + schema: SchemaRef, + injected_batch: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + injected_batch, + properties: create_unbounded_stream_properties(schema), + } + } +} + +impl DisplayAs for InjectableSingleBatchExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "InjectableSingleBatchExec") + } +} + +impl ExecutionPlan for InjectableSingleBatchExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn name(&self) -> &str { + physical_plan_node_name::RW_LOCK_READER + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal( + "InjectableSingleBatchExec does not support children".into(), + )) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let mut guard = self.injected_batch.write().map_err(|e| { + DataFusionError::Execution(format!("Failed to acquire write lock: {e}")) + })?; + + let batch = guard.take().ok_or_else(|| { + DataFusionError::Execution( + "Execution triggered, but no RecordBatch was injected into the node.".into(), + ) + })?; + + Ok(Box::pin(MemoryStream::try_new( + vec![batch], + self.schema.clone(), + None, + )?)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +// 
============================================================================ +// MpscReceiverStreamExec (formerly UnboundedRecordBatchReader) +// ============================================================================ + +/// Unbounded streaming source backed by a Tokio `mpsc` receiver. +/// +/// Bridges async producers (e.g. network threads) into a DataFusion pipeline. +#[derive(Debug)] +pub(crate) struct MpscReceiverStreamExec { + schema: SchemaRef, + channel_receiver: Arc>>>, + properties: PlanProperties, +} + +impl MpscReceiverStreamExec { + pub(crate) fn new( + schema: SchemaRef, + channel_receiver: Arc>>>, + ) -> Self { + Self { + schema: schema.clone(), + channel_receiver, + properties: create_unbounded_stream_properties(schema), + } + } +} + +impl DisplayAs for MpscReceiverStreamExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "MpscReceiverStreamExec") + } +} + +impl ExecutionPlan for MpscReceiverStreamExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn name(&self) -> &str { + physical_plan_node_name::UNBOUNDED_READER + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal( + "MpscReceiverStreamExec does not support children".into(), + )) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let mut guard = self.channel_receiver.write().map_err(|e| { + DataFusionError::Execution(format!("Failed to acquire lock for MPSC receiver: {e}")) + })?; + + let receiver = guard.take().ok_or_else(|| { + DataFusionError::Execution( + "The MPSC receiver was already consumed by a previous execution.".into(), + ) + })?; + + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema.clone(), + 
UnboundedReceiverStream::new(receiver).map(Ok), + ))) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +// ============================================================================ +// BufferedBatchesExec (formerly RecordBatchVecReader) +// ============================================================================ + +/// Drains a growable, locked `Vec` when `execute` runs (micro-batching). +#[derive(Debug)] +pub(crate) struct BufferedBatchesExec { + schema: SchemaRef, + buffered_batches: Arc>>, + properties: PlanProperties, +} + +impl BufferedBatchesExec { + pub(crate) fn new( + schema: SchemaRef, + buffered_batches: Arc>>, + ) -> Self { + Self { + schema: schema.clone(), + buffered_batches, + properties: create_unbounded_stream_properties(schema), + } + } +} + +impl DisplayAs for BufferedBatchesExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "BufferedBatchesExec") + } +} + +impl ExecutionPlan for BufferedBatchesExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn name(&self) -> &str { + physical_plan_node_name::VEC_READER + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Err(DataFusionError::Internal( + "BufferedBatchesExec does not support children".into(), + )) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let mut guard = self.buffered_batches.write().map_err(|e| { + DataFusionError::Execution(format!("Failed to acquire lock for buffered batches: {e}")) + })?; + + let accumulated_batches = mem::take(&mut *guard); + + let memory_config = + MemorySourceConfig::try_new(&[accumulated_batches], self.schema.clone(), None)?; + + 
DataSourceExec::new(Arc::new(memory_config)).execute(partition, context) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +// ============================================================================ +#[derive(Debug, Clone)] +pub struct PlanningPlaceholderExec { + pub table_name: String, + pub schema: SchemaRef, + properties: PlanProperties, +} + +impl PlanningPlaceholderExec { + pub fn new(table_name: String, schema: SchemaRef) -> Self { + Self { + schema: schema.clone(), + table_name, + properties: create_unbounded_stream_properties(schema), + } + } +} + +impl DisplayAs for PlanningPlaceholderExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "PlanningPlaceholderExec: schema={}", self.schema) + } +} + +impl ExecutionPlan for PlanningPlaceholderExec { + fn as_any(&self) -> &dyn Any { + self + } + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + fn properties(&self) -> &PlanProperties { + &self.properties + } + fn name(&self) -> &str { + physical_plan_node_name::MEM_EXEC + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + not_impl_err!("PlanningPlaceholderExec does not accept children.") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + plan_err!("PlanningPlaceholderExec cannot be executed; swap for a real source before run.") + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema)) + } + + fn reset(&self) -> Result<()> { + Ok(()) + } +} + +// Backward-compatible aliases +pub type FsMemExec = PlanningPlaceholderExec; +pub type RwLockRecordBatchReader = InjectableSingleBatchExec; +pub type UnboundedRecordBatchReader = MpscReceiverStreamExec; +pub type RecordBatchVecReader = BufferedBatchesExec; diff --git 
a/src/sql/physical/udfs.rs b/src/sql/physical/udfs.rs new file mode 100644 index 00000000..767abf06 --- /dev/null +++ b/src/sql/physical/udfs.rs @@ -0,0 +1,138 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::StructArray; +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::common::{Result, ScalarValue, exec_err}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; + +use crate::make_udf_function; +use crate::sql::common::constants::window_function_udf; +use crate::sql::schema::utils::window_arrow_struct; + +// ============================================================================ +// WindowFunctionUdf (User-Defined Scalar Function) +// ============================================================================ + +/// UDF that packs two nanosecond timestamps into the canonical window `Struct` type. +/// +/// Stream SQL uses a single struct column `[start, end)` for tumbling/hopping windows; +/// this keeps `GROUP BY` and physical codec alignment on one Arrow shape. 
+#[derive(Debug)] +pub struct WindowFunctionUdf { + signature: Signature, +} + +impl Default for WindowFunctionUdf { + fn default() -> Self { + Self { + signature: Signature::new( + TypeSignature::Exact(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for WindowFunctionUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + window_function_udf::NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(window_arrow_struct()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let columns = args.args; + + if columns.len() != 2 { + return exec_err!( + "Window UDF expected exactly 2 arguments, but received {}", + columns.len() + ); + } + + let DataType::Struct(fields) = window_arrow_struct() else { + return exec_err!( + "Internal Engine Error: window_arrow_struct() must return a Struct DataType" + ); + }; + + let start_val = &columns[0]; + let end_val = &columns[1]; + + if !matches!( + start_val.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return exec_err!("Window UDF expected first argument to be a Nanosecond Timestamp"); + } + if !matches!( + end_val.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return exec_err!("Window UDF expected second argument to be a Nanosecond Timestamp"); + } + + match (start_val, end_val) { + (ColumnarValue::Array(start_arr), ColumnarValue::Array(end_arr)) => { + let struct_array = + StructArray::try_new(fields, vec![start_arr.clone(), end_arr.clone()], None)?; + Ok(ColumnarValue::Array(Arc::new(struct_array))) + } + + (ColumnarValue::Array(start_arr), ColumnarValue::Scalar(end_scalar)) => { + let end_arr = end_scalar.to_array_of_size(start_arr.len())?; + let struct_array = + StructArray::try_new(fields, vec![start_arr.clone(), end_arr], None)?; 
+ Ok(ColumnarValue::Array(Arc::new(struct_array))) + } + + (ColumnarValue::Scalar(start_scalar), ColumnarValue::Array(end_arr)) => { + let start_arr = start_scalar.to_array_of_size(end_arr.len())?; + let struct_array = + StructArray::try_new(fields, vec![start_arr, end_arr.clone()], None)?; + Ok(ColumnarValue::Array(Arc::new(struct_array))) + } + + (ColumnarValue::Scalar(start_scalar), ColumnarValue::Scalar(end_scalar)) => { + let struct_array = StructArray::try_new( + fields, + vec![start_scalar.to_array()?, end_scalar.to_array()?], + None, + )?; + Ok(ColumnarValue::Scalar(ScalarValue::Struct(Arc::new( + struct_array, + )))) + } + } + } +} + +make_udf_function!(WindowFunctionUdf, WINDOW_FUNCTION, window); diff --git a/src/sql/schema/catalog_ddl.rs b/src/sql/schema/catalog_ddl.rs new file mode 100644 index 00000000..45936912 --- /dev/null +++ b/src/sql/schema/catalog_ddl.rs @@ -0,0 +1,251 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Best-effort SQL text for catalog introspection (`SHOW CREATE TABLE`). 
+ +use std::collections::BTreeMap; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; + +use super::schema_provider::StreamTable; +use super::table::Table as CatalogTable; +use crate::sql::logical_node::logical::LogicalProgram; + +fn data_type_sql(dt: &DataType) -> String { + match dt { + DataType::Null => "NULL".to_string(), + DataType::Boolean => "BOOLEAN".to_string(), + DataType::Int8 => "TINYINT".to_string(), + DataType::Int16 => "SMALLINT".to_string(), + DataType::Int32 => "INT".to_string(), + DataType::Int64 => "BIGINT".to_string(), + DataType::UInt8 => "TINYINT UNSIGNED".to_string(), + DataType::UInt16 => "SMALLINT UNSIGNED".to_string(), + DataType::UInt32 => "INT UNSIGNED".to_string(), + DataType::UInt64 => "BIGINT UNSIGNED".to_string(), + DataType::Float16 => "FLOAT".to_string(), + DataType::Float32 => "REAL".to_string(), + DataType::Float64 => "DOUBLE".to_string(), + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => "VARCHAR".to_string(), + DataType::Binary | DataType::LargeBinary => "VARBINARY".to_string(), + DataType::Date32 => "DATE".to_string(), + DataType::Date64 => "DATE".to_string(), + DataType::Timestamp(unit, tz) => match (unit, tz) { + (TimeUnit::Second, None) => "TIMESTAMP(0)".to_string(), + (TimeUnit::Millisecond, None) => "TIMESTAMP(3)".to_string(), + (TimeUnit::Microsecond, None) => "TIMESTAMP(6)".to_string(), + (TimeUnit::Nanosecond, None) => "TIMESTAMP(9)".to_string(), + (_, Some(_)) => "TIMESTAMP WITH TIME ZONE".to_string(), + }, + DataType::Decimal128(p, s) => format!("DECIMAL({p},{s})"), + DataType::Decimal256(p, s) => format!("DECIMAL({p},{s})"), + _ => dt.to_string(), + } +} + +fn format_columns(schema: &datafusion::arrow::datatypes::Schema) -> Vec { + schema + .fields() + .iter() + .map(|f| { + let null = if f.is_nullable() { "" } else { " NOT NULL" }; + format!(" {} {}{}", f.name(), data_type_sql(f.data_type()), null) + }) + .collect() +} + +fn format_with_clause(opts: &BTreeMap) -> String { + if opts.is_empty() { 
+ return "WITH ('connector' = '...', 'format' = '...');\n/* Original WITH options are not persisted in the stream catalog. */\n" + .to_string(); + } + let pairs: Vec = opts + .iter() + .map(|(k, v)| { + let k_esc = k.replace('\'', "''"); + let v_esc = v.replace('\'', "''"); + format!(" '{k_esc}' = '{v_esc}'") + }) + .collect(); + format!("WITH (\n{}\n);\n", pairs.join(",\n")) +} + +/// Single-line `col:TYPE` list for result grids. +pub fn schema_columns_one_line(schema: &datafusion::arrow::datatypes::Schema) -> String { + schema + .fields() + .iter() + .map(|f| format!("{}:{}", f.name(), data_type_sql(f.data_type()))) + .collect::>() + .join(", ") +} + +fn pipeline_summary_short(program: &LogicalProgram) -> String { + let mut parts: Vec = Vec::new(); + parts.push(format!("tasks={}", program.task_count())); + parts.push(format!("hash={}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}", o.operator_name)) + .collect::>() + .join("->"); + parts.push(format!("n{}:{}", nw.node_id, chain)); + } + parts.join(" | ") +} + +/// Extra fields for `SHOW TABLES` result grid (pipeline summary; no full Graphviz). +pub fn stream_table_row_detail(table: &StreamTable) -> String { + match table { + StreamTable::Source { + connector, + event_time_field, + watermark_field, + with_options, + .. + } => { + format!( + "connector={}, event_time={:?}, watermark={:?}, with_options={}", + connector, + event_time_field, + watermark_field, + with_options.len() + ) + } + StreamTable::Sink { program, .. 
} => pipeline_summary_short(program), + } +} + +fn pipeline_text(program: &LogicalProgram) -> String { + let mut lines: Vec = Vec::new(); + lines.push(format!("tasks_total: {}", program.task_count())); + lines.push(format!("program_hash: {}", program.get_hash())); + for nw in program.graph.node_weights() { + let chain = nw + .operator_chain + .operators + .iter() + .map(|o| format!("{}[{}]", o.operator_name, o.operator_id)) + .collect::>() + .join(" -> "); + lines.push(format!( + "node {} (parallelism={}): {chain}", + nw.node_id, nw.parallelism + )); + } + let dot = program.dot(); + const MAX_DOT: usize = 12_000; + if dot.len() > MAX_DOT { + lines.push(format!( + "graphviz_dot_truncated:\n{}... [{} more bytes]", + &dot[..MAX_DOT], + dot.len() - MAX_DOT + )); + } else { + lines.push(format!("graphviz_dot:\n{dot}")); + } + lines.join("\n") +} + +/// Human-readable `SHOW CREATE TABLE` text (sink `AS SELECT` is not stored). +pub fn show_create_stream_table(table: &StreamTable) -> String { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let cols = format_columns(schema); + let mut ddl = format!("CREATE TABLE {name} (\n{}\n)", cols.join(",\n")); + if let Some(e) = event_time_field { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = watermark_field { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut merged_opts = with_options.clone(); + merged_opts + .entry("connector".to_string()) + .or_insert_with(|| connector.clone()); + ddl.push_str(&format_with_clause(&merged_opts)); + ddl + } + StreamTable::Sink { name, program } => { + let schema = program.egress_arrow_schema().unwrap_or_else(|| { + std::sync::Arc::new(datafusion::arrow::datatypes::Schema::empty()) + }); + let cols = format_columns(&schema); + let mut ddl = format!( + "CREATE STREAMING TABLE {name}\nWITH ('connector' = '...') AS SELECT ...\n/* Sink WITH / AS SELECT text is not 
stored. Output schema:\n{}\n*/\n\n", + cols.join(",\n") + ); + ddl.push_str("-- Resolved logical pipeline:\n"); + ddl.push_str(&pipeline_text(program)); + ddl.push('\n'); + ddl + } + } +} + +/// Extra fields for `SHOW TABLES` result grid for persisted catalog rows. +pub fn catalog_table_row_detail(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) => format!( + "kind=connector, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::LookupTable(source) => format!( + "kind=lookup, connector={}, event_time={:?}, watermark={:?}, with_options={}", + source.connector(), + source.event_time_field(), + source.temporal_config.watermark_strategy_column, + source.catalog_with_options().len() + ), + CatalogTable::TableFromQuery { .. } => "kind=query".to_string(), + } +} + +/// Human-readable `SHOW CREATE TABLE` text for persisted catalog rows. +pub fn show_create_catalog_table(table: &CatalogTable) -> String { + match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let schema = source.produce_physical_schema(); + let cols = format_columns(&schema); + let mut ddl = format!("CREATE TABLE {} (\n{}\n)", source.name(), cols.join(",\n")); + if let Some(e) = source.event_time_field() { + ddl.push_str(&format!("\n/* EVENT TIME COLUMN: {e} */\n")); + } + if let Some(w) = source.temporal_config.watermark_strategy_column.as_deref() { + ddl.push_str(&format!("/* WATERMARK: {w} */\n")); + } + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + ddl.push_str(&format_with_clause(&opts)); + ddl + } + CatalogTable::TableFromQuery { name, .. 
} => { + format!( + "CREATE TABLE {name} AS SELECT ...;\n/* logical query text is not persisted */\n" + ) + } + } +} diff --git a/src/sql/schema/column_descriptor.rs b/src/sql/schema/column_descriptor.rs new file mode 100644 index 00000000..4228816f --- /dev/null +++ b/src/sql/schema/column_descriptor.rs @@ -0,0 +1,144 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; +use datafusion::logical_expr::Expr; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ColumnDescriptor { + Physical(Field), + SystemMeta { field: Field, meta_key: String }, + Computed { field: Field, logic: Box }, +} + +impl ColumnDescriptor { + #[inline] + pub fn new_physical(field: Field) -> Self { + Self::Physical(field) + } + + #[inline] + pub fn new_system_meta(field: Field, meta_key: impl Into) -> Self { + Self::SystemMeta { + field, + meta_key: meta_key.into(), + } + } + + #[inline] + pub fn new_computed(field: Field, logic: Expr) -> Self { + Self::Computed { + field, + logic: Box::new(logic), + } + } + + #[inline] + pub fn arrow_field(&self) -> &Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. } => f, + } + } + + #[inline] + pub fn into_arrow_field(self) -> Field { + match self { + Self::Physical(f) => f, + Self::SystemMeta { field: f, .. } => f, + Self::Computed { field: f, .. 
} => f, + } + } + + #[inline] + pub fn is_computed(&self) -> bool { + matches!(self, Self::Computed { .. }) + } + + #[inline] + pub fn is_physical(&self) -> bool { + matches!(self, Self::Physical(_)) + } + + #[inline] + pub fn system_meta_key(&self) -> Option<&str> { + if let Self::SystemMeta { meta_key, .. } = self { + Some(meta_key.as_str()) + } else { + None + } + } + + #[inline] + pub fn computation_logic(&self) -> Option<&Expr> { + if let Self::Computed { logic, .. } = self { + Some(logic) + } else { + None + } + } + + #[inline] + pub fn data_type(&self) -> &DataType { + self.arrow_field().data_type() + } + + pub fn set_nullable(&mut self, nullable: bool) { + let f = match self { + Self::Physical(f) => f, + Self::SystemMeta { field, .. } => field, + Self::Computed { field, .. } => field, + }; + *f = Field::new(f.name(), f.data_type().clone(), nullable) + .with_metadata(f.metadata().clone()); + } + + pub fn force_precision(&mut self, unit: TimeUnit) { + match self { + Self::Physical(f) => { + if let DataType::Timestamp(_, tz) = f.data_type() { + *f = Field::new( + f.name(), + DataType::Timestamp(unit, tz.clone()), + f.is_nullable(), + ); + } + } + Self::SystemMeta { field, .. } => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + Self::Computed { field, .. 
} => { + if let DataType::Timestamp(_, tz) = field.data_type() { + *field = Field::new( + field.name(), + DataType::Timestamp(unit, tz.clone()), + field.is_nullable(), + ); + } + } + } + } +} + +impl From for ColumnDescriptor { + #[inline] + fn from(field: Field) -> Self { + Self::Physical(field) + } +} diff --git a/src/sql/schema/connection_type.rs b/src/sql/schema/connection_type.rs new file mode 100644 index 00000000..06a3df92 --- /dev/null +++ b/src/sql/schema/connection_type.rs @@ -0,0 +1,31 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; + +/// Describes the role of a connection in the streaming pipeline. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum ConnectionType { + Source, + Sink, + Lookup, +} + +impl fmt::Display for ConnectionType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ConnectionType::Source => write!(f, "source"), + ConnectionType::Sink => write!(f, "sink"), + ConnectionType::Lookup => write!(f, "lookup"), + } + } +} diff --git a/src/sql/schema/connector_config.rs b/src/sql/schema/connector_config.rs new file mode 100644 index 00000000..edb44bae --- /dev/null +++ b/src/sql/schema/connector_config.rs @@ -0,0 +1,72 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use protocol::function_stream_graph::{ + GenericConnectorConfig, KafkaSinkConfig, KafkaSourceConfig, connector_op, +}; + +#[derive(Debug, Clone)] +pub enum ConnectorConfig { + KafkaSource(KafkaSourceConfig), + KafkaSink(KafkaSinkConfig), + Generic(HashMap), +} + +impl ConnectorConfig { + pub fn to_proto_config(&self) -> connector_op::Config { + match self { + ConnectorConfig::KafkaSource(cfg) => connector_op::Config::KafkaSource(cfg.clone()), + ConnectorConfig::KafkaSink(cfg) => connector_op::Config::KafkaSink(cfg.clone()), + ConnectorConfig::Generic(props) => { + connector_op::Config::Generic(GenericConnectorConfig { + properties: props.clone(), + }) + } + } + } +} + +impl PartialEq for ConnectorConfig { + fn eq(&self, other: &Self) -> bool { + use prost::Message; + match (self, other) { + (ConnectorConfig::KafkaSource(a), ConnectorConfig::KafkaSource(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::KafkaSink(a), ConnectorConfig::KafkaSink(b)) => { + a.encode_to_vec() == b.encode_to_vec() + } + (ConnectorConfig::Generic(a), ConnectorConfig::Generic(b)) => a == b, + _ => false, + } + } +} + +impl Eq for ConnectorConfig {} + +impl std::hash::Hash for ConnectorConfig { + fn hash(&self, state: &mut H) { + use prost::Message; + std::mem::discriminant(self).hash(state); + match self { + ConnectorConfig::KafkaSource(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::KafkaSink(cfg) => cfg.encode_to_vec().hash(state), + ConnectorConfig::Generic(m) => { + let mut pairs: Vec<_> = 
m.iter().collect(); + pairs.sort_by_key(|(k, _)| (*k).clone()); + pairs.hash(state); + } + } + } +} diff --git a/src/sql/schema/data_encoding_format.rs b/src/sql/schema/data_encoding_format.rs new file mode 100644 index 00000000..1cd5c736 --- /dev/null +++ b/src/sql/schema/data_encoding_format.rs @@ -0,0 +1,88 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion::common::{Result, plan_err}; + +use super::column_descriptor::ColumnDescriptor; +use crate::sql::common::Format; +use crate::sql::common::constants::{cdc, connection_format_value, with_opt_bool_str}; +use crate::sql::common::with_option_keys as opt; + +/// High-level payload encoding (orthogonal to `Format` wire details in `ConnectionSchema`). 
/// Wire payload encoding for a connector table, derived either from raw SQL
/// `WITH` options (catalog rebuild) or from a parsed connection [`Format`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataEncodingFormat {
    StandardJson,
    DebeziumJson,
    Avro,
    Parquet,
    /// Everything else: protobuf, raw string/bytes, and unknown format strings.
    Raw,
}

impl DataEncodingFormat {
    /// Derive the encoding from a flat string option map (catalog rebuild path).
    ///
    /// `format` falls back to [`opt::DEFAULT_FORMAT_VALUE`] when absent. The
    /// Debezium flag is read from the dedicated key first, then the legacy
    /// `json.debezium` key; anything other than the TRUE literal counts as false.
    /// Unrecognized format strings map to [`Self::Raw`] rather than erroring.
    // NOTE(review): generic parameters below were lost in extraction and
    // reconstructed — confirm `HashMap<String, String>` / `Result<Self>`.
    pub fn extract_from_map(opts: &HashMap<String, String>) -> Result<Self> {
        let format_str = opts
            .get(opt::FORMAT)
            .map(|s| s.as_str())
            .unwrap_or(opt::DEFAULT_FORMAT_VALUE);
        let is_debezium = opts
            .get(opt::FORMAT_DEBEZIUM_FLAG)
            .or_else(|| opts.get(opt::JSON_DEBEZIUM))
            .map(|s| s == with_opt_bool_str::TRUE)
            .unwrap_or(false);

        match (format_str, is_debezium) {
            // json + debezium flag, or the explicit debezium-json format name.
            (f, true) if f == connection_format_value::JSON => Ok(Self::DebeziumJson),
            (f, _) if f == connection_format_value::DEBEZIUM_JSON => Ok(Self::DebeziumJson),
            (f, false) if f == connection_format_value::JSON => Ok(Self::StandardJson),
            // NOTE(review): a debezium flag combined with avro/parquet is
            // silently ignored here — confirm that is intended.
            (f, _) if f == connection_format_value::AVRO => Ok(Self::Avro),
            (f, _) if f == connection_format_value::PARQUET => Ok(Self::Parquet),
            _ => Ok(Self::Raw),
        }
    }

    /// Map a parsed connection [`Format`] onto this coarser encoding enum.
    pub fn from_connection_format(format: &Format) -> Self {
        match format {
            Format::Json(j) if j.debezium => Self::DebeziumJson,
            Format::Json(_) => Self::StandardJson,
            Format::Avro(_) => Self::Avro,
            Format::Parquet(_) => Self::Parquet,
            Format::Protobuf(_) | Format::RawString(_) | Format::RawBytes(_) => Self::Raw,
        }
    }

    /// Only Debezium JSON carries retract/update (CDC) semantics.
    pub fn supports_delta_updates(&self) -> bool {
        matches!(self, Self::DebeziumJson)
    }

    /// Wrap the user columns in a CDC envelope (`before`/`after` struct columns
    /// plus an `op` column) when this encoding supports delta updates; returns
    /// the columns unchanged otherwise (and when the column list is empty).
    ///
    /// # Errors
    /// Fails if any column is computed — virtual fields cannot be represented
    /// inside the envelope.
    pub fn apply_envelope(self, columns: Vec<ColumnDescriptor>) -> Result<Vec<ColumnDescriptor>> {
        if !self.supports_delta_updates() {
            return Ok(columns);
        }
        if columns.iter().any(|c| c.is_computed()) {
            return plan_err!("Virtual fields are not supported with CDC envelope");
        }
        if columns.is_empty() {
            return Ok(columns);
        }
        // All user columns become one struct type shared by `before` and `after`.
        let fields: Vec<Field> = columns.into_iter().map(|c| c.into_arrow_field()).collect();
        let struct_type = DataType::Struct(fields.into());

        Ok(vec![
            ColumnDescriptor::new_physical(Field::new(cdc::BEFORE, struct_type.clone(), true)),
            ColumnDescriptor::new_physical(Field::new(cdc::AFTER, struct_type.clone(), true)),
            ColumnDescriptor::new_physical(Field::new(cdc::OP, DataType::Utf8, true)),
        ])
    }
}

// ----- new file: src/sql/schema/kafka_operator_config.rs -----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
//
// Builds strongly-typed proto Kafka configs from SQL DDL WITH options.

use std::collections::HashMap;

use datafusion::arrow::datatypes::Schema;
use datafusion::common::{Result as DFResult, plan_datafusion_err, plan_err};

use protocol::function_stream_graph::connector_op::Config as ProtoConfig;
use protocol::function_stream_graph::{
    BadDataPolicy, DecimalEncodingProto, FormatConfig, JsonFormatConfig, KafkaAuthConfig,
    KafkaAuthNone, KafkaOffsetMode, KafkaReadMode, KafkaSinkCommitMode, KafkaSinkConfig,
    KafkaSourceConfig, RawBytesFormatConfig, RawStringFormatConfig, TimestampFormatProto,
};

use crate::sql::common::connector_options::ConnectorOptions;
use crate::sql::common::constants::{connection_table_role, kafka_with_value};
use crate::sql::common::formats::{
    BadData, DecimalEncoding as SqlDecimalEncoding, Format as SqlFormat,
    TimestampFormat as SqlTimestampFormat,
};
use crate::sql::common::with_option_keys as opt;
use crate::sql::schema::table_role::TableRole;

/// Translate the SQL-layer [`SqlFormat`] into the wire `FormatConfig` proto.
/// Only JSON, raw-string, and raw-bytes formats are supported on Kafka today;
/// anything else (avro, parquet, protobuf, …) is a plan error.
fn sql_format_to_proto(fmt: &SqlFormat) -> DFResult<FormatConfig> {
    match fmt {
        SqlFormat::Json(j) => Ok(FormatConfig {
            format: Some(
                protocol::function_stream_graph::format_config::Format::Json(JsonFormatConfig {
                    timestamp_format: match j.timestamp_format {
                        SqlTimestampFormat::RFC3339 => {
                            TimestampFormatProto::TimestampRfc3339 as i32
                        }
                        SqlTimestampFormat::UnixMillis => {
                            TimestampFormatProto::TimestampUnixMillis as i32
                        }
                    },
                    // prost enum fields are carried as i32 on the wire.
                    decimal_encoding: match j.decimal_encoding {
                        SqlDecimalEncoding::Number => DecimalEncodingProto::DecimalNumber as i32,
                        SqlDecimalEncoding::String => DecimalEncodingProto::DecimalString as i32,
                        SqlDecimalEncoding::Bytes => DecimalEncodingProto::DecimalBytes as i32,
                    },
                    include_schema: j.include_schema,
                    confluent_schema_registry: j.confluent_schema_registry,
                    schema_id: j.schema_id,
                    debezium: j.debezium,
                    unstructured: j.unstructured,
                }),
            ),
        }),
        SqlFormat::RawString(_) => Ok(FormatConfig {
            format: Some(
                protocol::function_stream_graph::format_config::Format::RawString(
                    RawStringFormatConfig {},
                ),
            ),
        }),
        SqlFormat::RawBytes(_) => Ok(FormatConfig {
            format: Some(
                protocol::function_stream_graph::format_config::Format::RawBytes(
                    RawBytesFormatConfig {},
                ),
            ),
        }),
        other => plan_err!(
            "Kafka connector: format '{}' is not supported yet",
            other.name()
        ),
    }
}

/// Map the SQL bad-data policy onto the proto enum's i32 value.
fn sql_bad_data_to_proto(bad: &BadData) -> i32 {
    match bad {
        BadData::Fail {} => BadDataPolicy::BadDataFail as i32,
        BadData::Drop {} => BadDataPolicy::BadDataDrop as i32,
    }
}

/// Build Kafka proto config from a flat string map (catalog rebuild path).
// Catalog-reload entry point: parses format / bad_data / framing and the table
// role out of a flat string map, then delegates to `build_kafka_proto_config`.
// NOTE: `pull_opt_*` calls mutate `options`; the ordering here mirrors the SQL
// DDL path so that `drain_remaining_string_values` sees the same residue.
pub fn build_kafka_proto_config_from_string_map(
    map: HashMap<String, String>,
    _physical_schema: &Schema,
) -> DFResult<ProtoConfig> {
    let mut options = ConnectorOptions::from_flat_string_map(map)?;
    let format = crate::sql::common::formats::Format::from_opts(&mut options)
        .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid format: {e}")))?;
    let bad_data = BadData::from_opts(&mut options).map_err(|e| {
        datafusion::error::DataFusionError::Plan(format!("Invalid bad_data: '{e}'"))
    })?;
    // Framing is parsed (and thus consumed from the option map) but unused here.
    let _framing = crate::sql::common::formats::Framing::from_opts(&mut options)
        .map_err(|e| datafusion::error::DataFusionError::Plan(format!("invalid framing: '{e}'")))?;

    // Missing `type` defaults to a source (Ingestion).
    let role = match options.pull_opt_str(opt::TYPE)?.as_deref() {
        None | Some(connection_table_role::SOURCE) => TableRole::Ingestion,
        Some(connection_table_role::SINK) => TableRole::Egress,
        Some(connection_table_role::LOOKUP) => TableRole::Reference,
        Some(other) => {
            return plan_err!("invalid connection type '{other}' in WITH options");
        }
    };

    build_kafka_proto_config(&mut options, role, &format, bad_data)
}

/// Core builder shared by SQL DDL and catalog reload paths.
///
/// Pulls the well-known Kafka keys from `options` (mutating it), then hands
/// every remaining string option to the client as a raw Kafka client config.
///
/// # Errors
/// Plan errors for missing `bootstrap.servers` / `topic` / `format`, for
/// unrecognized mode values, and when `role` is [`TableRole::Reference`].
// NOTE(review): the `Option<SqlFormat>` parameter type was reconstructed from
// stripped generics — confirm against the callers.
pub fn build_kafka_proto_config(
    options: &mut ConnectorOptions,
    role: TableRole,
    format: &Option<SqlFormat>,
    bad_data: BadData,
) -> DFResult<ProtoConfig> {
    // `bootstrap.servers` is required; a legacy key name is accepted as fallback.
    let bootstrap_servers = match options.pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS)? {
        Some(s) => s,
        None => options
            .pull_opt_str(opt::KAFKA_BOOTSTRAP_SERVERS_LEGACY)?
            .ok_or_else(|| {
                plan_datafusion_err!(
                    "Kafka connector requires 'bootstrap.servers' in the WITH clause"
                )
            })?,
    };

    let topic = options.pull_opt_str(opt::KAFKA_TOPIC)?.ok_or_else(|| {
        plan_datafusion_err!("Kafka connector requires 'topic' in the WITH clause")
    })?;

    let sql_format = format.clone().ok_or_else(|| {
        plan_datafusion_err!(
            "Kafka connector requires 'format' in the WITH clause (e.g. format = 'json')"
        )
    })?;
    let proto_format = sql_format_to_proto(&sql_format)?;

    // 0 means "no rate limit"; configured values are clamped into u32 range.
    let rate_limit = options
        .pull_opt_u64(opt::KAFKA_RATE_LIMIT_MESSAGES_PER_SECOND)?
        .map(|v| v.clamp(1, u32::MAX as u64) as u32)
        .unwrap_or(0);

    let value_subject = options.pull_opt_str(opt::KAFKA_VALUE_SUBJECT)?;

    // NOTE(review): auth is hard-wired to None on this path — SASL/TLS is
    // presumably configured elsewhere or not yet supported; confirm.
    let auth = Some(KafkaAuthConfig {
        auth: Some(
            protocol::function_stream_graph::kafka_auth_config::Auth::None(KafkaAuthNone {}),
        ),
    });

    // Consume bookkeeping keys so they don't leak into `client_configs` below.
    let _ = options.pull_opt_str(opt::TYPE)?;
    let _ = options.pull_opt_str(opt::CONNECTOR)?;

    match role {
        TableRole::Ingestion => {
            // Startup offset: latest / earliest / group-offsets (default: group).
            let offset_mode = match options
                .pull_opt_str(opt::KAFKA_SCAN_STARTUP_MODE)?
                .as_deref()
            {
                Some(s) if s == kafka_with_value::SCAN_LATEST => {
                    KafkaOffsetMode::KafkaOffsetLatest as i32
                }
                Some(s) if s == kafka_with_value::SCAN_EARLIEST => {
                    KafkaOffsetMode::KafkaOffsetEarliest as i32
                }
                Some(s)
                    if s == kafka_with_value::SCAN_GROUP_OFFSETS
                        || s == kafka_with_value::SCAN_GROUP =>
                {
                    KafkaOffsetMode::KafkaOffsetGroup as i32
                }
                None => KafkaOffsetMode::KafkaOffsetGroup as i32,
                Some(other) => {
                    return plan_err!(
                        "invalid scan.startup.mode '{other}'; expected latest, earliest, or group-offsets"
                    );
                }
            };

            let read_mode = match options.pull_opt_str(opt::KAFKA_ISOLATION_LEVEL)?.as_deref() {
                Some(s) if s == kafka_with_value::ISOLATION_READ_COMMITTED => {
                    KafkaReadMode::KafkaReadCommitted as i32
                }
                Some(s) if s == kafka_with_value::ISOLATION_READ_UNCOMMITTED => {
                    KafkaReadMode::KafkaReadUncommitted as i32
                }
                None => KafkaReadMode::KafkaReadDefault as i32,
                Some(other) => {
                    return plan_err!("invalid isolation.level '{other}'");
                }
            };

            // Preferred key first, then the legacy spelling.
            let group_id = match options.pull_opt_str(opt::KAFKA_GROUP_ID)? {
                Some(s) => Some(s),
                None => options.pull_opt_str(opt::KAFKA_GROUP_ID_LEGACY)?,
            };
            let group_id_prefix = options.pull_opt_str(opt::KAFKA_GROUP_ID_PREFIX)?;

            // Everything not pulled above becomes a raw librdkafka-style option.
            let client_configs = options.drain_remaining_string_values()?;

            Ok(ProtoConfig::KafkaSource(KafkaSourceConfig {
                topic,
                bootstrap_servers,
                group_id,
                group_id_prefix,
                offset_mode,
                read_mode,
                auth,
                client_configs,
                format: Some(proto_format),
                bad_data_policy: sql_bad_data_to_proto(&bad_data),
                rate_limit_msgs_per_sec: rate_limit,
                value_subject,
            }))
        }
        TableRole::Egress => {
            // Both hyphen and underscore spellings are accepted; default is
            // at-least-once. (`bad_data` and `rate_limit` do not apply to sinks.)
            let commit_mode = match options
                .pull_opt_str(opt::KAFKA_SINK_COMMIT_MODE)?
                .as_deref()
            {
                Some(s)
                    if s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_HYPHEN
                        || s == kafka_with_value::SINK_COMMIT_EXACTLY_ONCE_UNDERSCORE =>
                {
                    KafkaSinkCommitMode::KafkaSinkExactlyOnce as i32
                }
                None => KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32,
                Some(s)
                    if s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_HYPHEN
                        || s == kafka_with_value::SINK_COMMIT_AT_LEAST_ONCE_UNDERSCORE =>
                {
                    KafkaSinkCommitMode::KafkaSinkAtLeastOnce as i32
                }
                Some(other) => {
                    return plan_err!("invalid sink.commit.mode '{other}'");
                }
            };
            let key_field = match options.pull_opt_str(opt::KAFKA_SINK_KEY_FIELD)? {
                Some(s) => Some(s),
                None => options.pull_opt_str(opt::KAFKA_KEY_FIELD_LEGACY)?,
            };
            let timestamp_field = match options.pull_opt_str(opt::KAFKA_SINK_TIMESTAMP_FIELD)? {
                Some(s) => Some(s),
                None => options.pull_opt_str(opt::KAFKA_TIMESTAMP_FIELD_LEGACY)?,
            };

            let client_configs = options.drain_remaining_string_values()?;

            Ok(ProtoConfig::KafkaSink(KafkaSinkConfig {
                topic,
                bootstrap_servers,
                commit_mode,
                key_field,
                timestamp_field,
                auth,
                client_configs,
                format: Some(proto_format),
                value_subject,
            }))
        }
        TableRole::Reference => {
            plan_err!("Kafka connector cannot be used as a lookup table in this path")
        }
    }
}

// ----- new file: src/sql/schema/mod.rs -----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod catalog_ddl;
pub mod column_descriptor;
pub mod connection_type;
pub mod connector_config;
pub mod data_encoding_format;
pub mod kafka_operator_config;
pub mod schema_context;
pub mod schema_provider;
pub mod source_table;
pub mod table;
pub mod table_execution_unit;
pub mod table_role;
pub mod temporal_pipeline_config;
pub mod utils;

pub use catalog_ddl::{
    catalog_table_row_detail, schema_columns_one_line, show_create_catalog_table,
};
pub use column_descriptor::ColumnDescriptor;
pub use connection_type::ConnectionType;
pub use connector_config::ConnectorConfig;
pub use source_table::SourceTable;

/// Back-compat alias for [`SourceTable`].
pub type ConnectorTable = SourceTable;
pub use schema_provider::{ObjectName, StreamPlanningContext, StreamSchemaProvider, StreamTable};
pub use table::Table;

// ----- new file: src/sql/schema/schema_context.rs -----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use datafusion::arrow::datatypes::{DataType, Schema};
use datafusion::common::{DFSchema, Result};
use datafusion::logical_expr::Expr;
use datafusion_expr::ExprSchemable;

/// Abstraction for validating and typing logical expressions against an
/// Arrow schema, so callers don't depend on a concrete resolution strategy.
pub trait SchemaContext {
    /// Validate `expr` against `schema` and return it (possibly unchanged).
    fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result<Expr>;
    /// Compute the DataFusion [`DataType`] of `expr` under `schema`.
    fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result<DataType>;
}

/// [`SchemaContext`] backed by a [`DFSchema`] built from the physical Arrow schema.
pub struct DfSchemaContext;

impl SchemaContext for DfSchemaContext {
    fn resolve_expression(&self, expr: &Expr, schema: &Schema) -> Result<Expr> {
        // Building the DFSchema clones the Arrow schema on every call.
        let df = DFSchema::try_from(schema.clone())?;
        // Type-check only; the computed type is discarded, the expression is
        // returned as-is (no rewriting happens here).
        let _ = expr.get_type(&df)?;
        Ok(expr.clone())
    }

    fn extract_datatype(&self, expr: &Expr, schema: &Schema) -> Result<DataType> {
        let df = DFSchema::try_from(schema.clone())?;
        expr.get_type(&df)
    }
}

// ----- new file: src/sql/schema/schema_provider.rs -----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Arc;

use datafusion::arrow::datatypes::{self as datatypes, DataType, Field, Schema};
use datafusion::common::{DataFusionError, Result};
use datafusion::datasource::{DefaultTableSource, TableProvider, TableType};
use datafusion::execution::{FunctionRegistry, SessionStateDefaults};
use datafusion::logical_expr::expr_rewriter::FunctionRewrite;
use datafusion::logical_expr::planner::ExprPlanner;
use datafusion::logical_expr::{AggregateUDF, Expr, ScalarUDF, TableSource, WindowUDF};
use datafusion::optimizer::Analyzer;
use datafusion::sql::TableReference;
use datafusion::sql::planner::ContextProvider;
use unicase::UniCase;

use crate::sql::common::constants::{planning_placeholder_udf, window_fn};
use crate::sql::logical_node::logical::{DylibUdfConfig, LogicalProgram};
use crate::sql::schema::table::Table as CatalogTable;
use crate::sql::schema::utils::window_arrow_struct;
use crate::sql::types::{PlanningOptions, PlanningPlaceholderUdf};

// NOTE(review): several generic parameters in this file were lost in
// extraction; they have been reconstructed where unambiguous and flagged
// where not. Confirm each flagged type against the original source.

/// Case-insensitive table name key.
pub type ObjectName = UniCase<String>;

#[inline]
fn object_name(s: impl Into<String>) -> ObjectName {
    UniCase::new(s.into())
}

/// A planning-time table: either a connector-backed source or a sink wrapping
/// an already-planned [`LogicalProgram`].
#[derive(Clone, Debug)]
pub enum StreamTable {
    Source {
        name: String,
        connector: String,
        schema: Arc<Schema>,
        event_time_field: Option<String>,
        watermark_field: Option<String>,
        /// Persisted `WITH` options for `SHOW CREATE TABLE`.
        with_options: BTreeMap<String, String>,
    },
    Sink {
        name: String,
        program: LogicalProgram,
    },
}

impl StreamTable {
    /// The table's name, regardless of variant.
    pub fn name(&self) -> &str {
        match self {
            Self::Source { name, .. } | Self::Sink { name, .. } => name,
        }
    }

    /// The Arrow schema used for planning. A sink without an egress schema
    /// degrades to an empty schema rather than erroring.
    pub fn schema(&self) -> Arc<Schema> {
        match self {
            Self::Source { schema, .. } => Arc::clone(schema),
            Self::Sink { program, .. } => program
                .egress_arrow_schema()
                .unwrap_or_else(|| Arc::new(Schema::empty())),
        }
    }
}

/// Placeholder [`TableProvider`] used only to give the logical planner a
/// scannable node; the produced `FsMemExec` is resolved by name at runtime.
#[derive(Debug, Clone)]
pub struct LogicalBatchInput {
    pub table_name: String,
    pub schema: Arc<Schema>,
}

#[async_trait::async_trait]
impl TableProvider for LogicalBatchInput {
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn schema(&self) -> Arc<Schema> {
        Arc::clone(&self.schema)
    }

    fn table_type(&self) -> TableType {
        TableType::Temporary
    }

    // Projection, filters, and limit are intentionally ignored: this node is
    // a logical stand-in, not a real scan.
    async fn scan(
        &self,
        _state: &dyn datafusion::catalog::Session,
        _projection: Option<&Vec<usize>>,
        _filters: &[Expr],
        _limit: Option<usize>,
    ) -> Result<Arc<dyn datafusion::physical_plan::ExecutionPlan>> {
        Ok(Arc::new(crate::sql::physical::FsMemExec::new(
            self.table_name.clone(),
            Arc::clone(&self.schema),
        )))
    }
}

/// All user-visible functions known to the planner.
#[derive(Clone, Default)]
pub struct FunctionCatalog {
    pub scalars: HashMap<String, Arc<ScalarUDF>>,
    pub aggregates: HashMap<String, Arc<AggregateUDF>>,
    pub windows: HashMap<String, Arc<WindowUDF>>,
    pub planners: Vec<Arc<dyn ExprPlanner>>,
}

/// All tables known to the planner, keyed case-insensitively.
#[derive(Clone, Default)]
pub struct TableCatalog {
    pub streams: HashMap<ObjectName, Arc<StreamTable>>,
    pub catalogs: HashMap<ObjectName, Arc<CatalogTable>>,
    // NOTE(review): value type of `source_defs` was lost in extraction —
    // restore from the original source.
    pub source_defs: HashMap,
}

/// The planner's view of the world: tables, functions, dylib UDFs, and
/// DataFusion configuration. Implements both [`ContextProvider`] and
/// [`FunctionRegistry`].
#[derive(Clone, Default)]
pub struct StreamPlanningContext {
    pub tables: TableCatalog,
    pub functions: FunctionCatalog,
    pub dylib_udfs: HashMap<String, DylibUdfConfig>,
    pub config_options: datafusion::config::ConfigOptions,
    pub planning_options: PlanningOptions,
    pub analyzer: Analyzer,
}

/// Back-compat name for [`StreamPlanningContext`].
pub type StreamSchemaProvider = StreamPlanningContext;

impl StreamPlanningContext {
    pub fn builder() -> StreamPlanningContextBuilder {
        StreamPlanningContextBuilder::default()
    }

    /// Same registration order as the historical `StreamSchemaProvider::new`
    /// (placeholders, then DataFusion defaults).
    pub fn new() -> Self {
        Self::builder()
            .with_streaming_extensions()
            .expect("streaming extensions")
            .with_default_functions()
            .expect("default functions")
            .build()
    }

    /// Insert (or replace) a stream table under its case-insensitive name.
    pub fn register_stream_table(&mut self, table: StreamTable) {
        let key = object_name(table.name().to_string());
        self.tables.streams.insert(key, Arc::new(table));
    }

    pub fn get_stream_table(&self, name: &str) -> Option<Arc<StreamTable>> {
        self.tables
            .streams
            .get(&object_name(name.to_string()))
            .cloned()
    }

    pub fn register_catalog_table(&mut self, table: CatalogTable) {
        let key = object_name(table.name().to_string());
        self.tables.catalogs.insert(key, Arc::new(table));
    }

    pub fn get_catalog_table(&self, table_name: impl AsRef<str>) -> Option<&CatalogTable> {
        self.tables
            .catalogs
            .get(&object_name(table_name.as_ref().to_string()))
            .map(|t| t.as_ref())
    }

    /// Mutable access via `Arc::make_mut` — clones the table if it is shared.
    pub fn get_catalog_table_mut(
        &mut self,
        table_name: impl AsRef<str>,
    ) -> Option<&mut CatalogTable> {
        self.tables
            .catalogs
            .get_mut(&object_name(table_name.as_ref().to_string()))
            .map(Arc::make_mut)
    }

    /// Convenience: register a source with the fixed `stream_catalog`
    /// connector and no persisted WITH options.
    pub fn add_source_table(
        &mut self,
        name: String,
        schema: Arc<Schema>,
        event_time_field: Option<String>,
        watermark_field: Option<String>,
    ) {
        self.register_stream_table(StreamTable::Source {
            name,
            connector: "stream_catalog".to_string(),
            schema,
            event_time_field,
            watermark_field,
            with_options: BTreeMap::new(),
        });
    }

    pub fn add_sink_table(&mut self, name: String, program: LogicalProgram) {
        self.register_stream_table(StreamTable::Sink { name, program });
    }

    /// Alias for [`Self::register_stream_table`].
    pub fn insert_table(&mut self, table: StreamTable) {
        self.register_stream_table(table);
    }

    /// Alias for [`Self::register_catalog_table`].
    pub fn insert_catalog_table(&mut self, table: CatalogTable) {
        self.register_catalog_table(table);
    }

    pub fn get_table(&self, table_name: impl AsRef<str>) -> Option<&StreamTable> {
        self.tables
            .streams
            .get(&object_name(table_name.as_ref().to_string()))
            .map(|a| a.as_ref())
    }

    pub fn get_table_mut(&mut self, table_name: impl AsRef<str>) -> Option<&mut StreamTable> {
        self.tables
            .streams
            .get_mut(&object_name(table_name.as_ref().to_string()))
            .map(Arc::make_mut)
    }

    // Async UDF options are not supported on this path; always None.
    // NOTE(review): the Option's type parameter was lost in extraction —
    // restore from the original source.
    pub fn get_async_udf_options(&self, _name: &str) -> Option {
        None
    }

    /// Wrap a schema in the placeholder provider so the SQL planner can
    /// reference it as a table source.
    fn create_table_source(name: String, schema: Arc<Schema>) -> Arc<dyn TableSource> {
        let provider = LogicalBatchInput {
            table_name: name,
            schema,
        };
        Arc::new(DefaultTableSource::new(Arc::new(provider)))
    }
}

impl ContextProvider for StreamPlanningContext {
    fn get_table_source(&self, name: TableReference) -> Result<Arc<dyn TableSource>> {
        // Only the bare table name is used; catalog/schema qualifiers are
        // ignored for lookup but preserved in the error message.
        let table = self
            .get_stream_table(name.table())
            .ok_or_else(|| DataFusionError::Plan(format!("Table {} not found", name)))?;

        Ok(Self::create_table_source(name.to_string(), table.schema()))
    }

    fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>> {
        self.functions.scalars.get(name).cloned()
    }

    fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>> {
        self.functions.aggregates.get(name).cloned()
    }

    fn get_window_meta(&self, name: &str) -> Option<Arc<WindowUDF>> {
        self.functions.windows.get(name).cloned()
    }

    // SQL variables (@@var) are not supported.
    fn get_variable_type(&self, _variable_names: &[String]) -> Option<DataType> {
        None
    }

    fn options(&self) -> &datafusion::config::ConfigOptions {
        &self.config_options
    }

    fn udf_names(&self) -> Vec<String> {
        self.functions.scalars.keys().cloned().collect()
    }

    fn udaf_names(&self) -> Vec<String> {
        self.functions.aggregates.keys().cloned().collect()
    }

    fn udwf_names(&self) -> Vec<String> {
        self.functions.windows.keys().cloned().collect()
    }

    fn get_expr_planners(&self) -> &[Arc<dyn ExprPlanner>] {
        &self.functions.planners
    }
}

impl FunctionRegistry for StreamPlanningContext {
    fn udfs(&self) -> HashSet<String> {
        self.functions.scalars.keys().cloned().collect()
    }

    fn udf(&self, name: &str) -> Result<Arc<ScalarUDF>> {
        self.functions
            .scalars
            .get(name)
            .cloned()
            .ok_or_else(|| DataFusionError::Plan(format!("No UDF with name {name}")))
    }

    fn udaf(&self, name: &str) -> Result<Arc<AggregateUDF>> {
        self.functions
            .aggregates
            .get(name)
            .cloned()
            .ok_or_else(|| DataFusionError::Plan(format!("No UDAF with name {name}")))
    }

    fn udwf(&self, name: &str) -> Result<Arc<WindowUDF>> {
        self.functions
            .windows
            .get(name)
            .cloned()
            .ok_or_else(|| DataFusionError::Plan(format!("No UDWF with name {name}")))
    }

    fn register_function_rewrite(
        &mut self,
        rewrite: Arc<dyn FunctionRewrite + Send + Sync>,
    ) -> Result<()> {
        self.analyzer.add_function_rewrite(rewrite);
        Ok(())
    }

    // The registration methods return the previously registered function with
    // the same name, if any (HashMap::insert semantics).
    fn register_udf(&mut self, udf: Arc<ScalarUDF>) -> Result<Option<Arc<ScalarUDF>>> {
        Ok(self.functions.scalars.insert(udf.name().to_string(), udf))
    }

    fn register_udaf(&mut self, udaf: Arc<AggregateUDF>) -> Result<Option<Arc<AggregateUDF>>> {
        Ok(self
            .functions
            .aggregates
            .insert(udaf.name().to_string(), udaf))
    }

    fn register_udwf(&mut self, udwf: Arc<WindowUDF>) -> Result<Option<Arc<WindowUDF>>> {
        Ok(self.functions.windows.insert(udwf.name().to_string(), udwf))
    }

    fn register_expr_planner(&mut self, expr_planner: Arc<dyn ExprPlanner>) -> Result<()> {
        self.functions.planners.push(expr_planner);
        Ok(())
    }

    fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> {
        self.functions.planners.clone()
    }
}

/// Builder that controls registration order of default functions vs.
/// streaming placeholder UDFs.
#[derive(Default)]
pub struct StreamPlanningContextBuilder {
    context: StreamPlanningContext,
}

impl StreamPlanningContextBuilder {
    pub fn new() -> Self {
        Self::default()
    }

    /// Register DataFusion's stock scalar/aggregate/window functions and
    /// expression planners.
    pub fn with_default_functions(mut self) -> Result<Self> {
        for p in SessionStateDefaults::default_scalar_functions() {
            self.context.register_udf(p)?;
        }
        for p in SessionStateDefaults::default_aggregate_functions() {
            self.context.register_udaf(p)?;
        }
        for p in SessionStateDefaults::default_window_functions() {
            self.context.register_udwf(p)?;
        }
        for p in SessionStateDefaults::default_expr_planners() {
            self.context.register_expr_planner(p)?;
        }

        Ok(self)
    }

    /// Register planning-only placeholder UDFs (window functions, unnest,
    /// row-time) that are rewritten away before execution.
    pub fn with_streaming_extensions(mut self) -> Result<Self> {
        let extensions = vec![
            // hop(slide, width) -> window struct
            PlanningPlaceholderUdf::new_with_return(
                window_fn::HOP,
                vec![
                    DataType::Interval(datatypes::IntervalUnit::MonthDayNano),
                    DataType::Interval(datatypes::IntervalUnit::MonthDayNano),
                ],
                window_arrow_struct(),
            ),
            PlanningPlaceholderUdf::new_with_return(
                window_fn::TUMBLE,
                vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)],
                window_arrow_struct(),
            ),
            PlanningPlaceholderUdf::new_with_return(
                window_fn::SESSION,
                vec![DataType::Interval(datatypes::IntervalUnit::MonthDayNano)],
                window_arrow_struct(),
            ),
            PlanningPlaceholderUdf::new_with_return(
                planning_placeholder_udf::UNNEST,
                vec![DataType::List(Arc::new(Field::new(
                    planning_placeholder_udf::LIST_ELEMENT_FIELD,
                    DataType::Utf8,
                    true,
                )))],
                DataType::Utf8,
            ),
            PlanningPlaceholderUdf::new_with_return(
                planning_placeholder_udf::ROW_TIME,
                vec![],
                DataType::Timestamp(datatypes::TimeUnit::Nanosecond, None),
            ),
        ];

        for ext in extensions {
            self.context.register_udf(ext)?;
        }

        Ok(self)
    }

    pub fn build(self) -> StreamPlanningContext {
        self.context
    }
}

// ----- new file: src/sql/schema/source_table.rs -----

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::Duration;

use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema};
use datafusion::common::{Column, DFSchema, Result, plan_datafusion_err, plan_err};
use datafusion::error::DataFusionError;
use datafusion::logical_expr::Expr;
use datafusion::sql::TableReference;
use datafusion::sql::planner::{PlannerContext, SqlToRel};
use datafusion::sql::sqlparser::ast;
use datafusion_expr::ExprSchemable;
use protocol::function_stream_graph::ConnectorOp;
use tracing::warn;

use super::StreamSchemaProvider;
use super::column_descriptor::ColumnDescriptor;
use super::connector_config::ConnectorConfig;
use super::data_encoding_format::DataEncodingFormat;
use super::schema_context::SchemaContext;
use super::table_execution_unit::{EngineDescriptor, SyncMode, TableExecutionUnit};
use super::table_role::{
    TableRole, apply_adapter_specific_rules, deduce_role, serialize_backend_params,
    validate_adapter_availability,
};
use super::temporal_pipeline_config::{
    TemporalPipelineConfig, TemporalSpec, resolve_temporal_logic,
};
use crate::multifield_partial_ord;
use crate::sql::api::ConnectionProfile;
use crate::sql::common::connector_options::ConnectorOptions;
use crate::sql::common::constants::{connection_table_role, connector_type, sql_field};
use crate::sql::common::with_option_keys as opt;
use crate::sql::common::{BadData, Format, Framing, FsSchema, JsonCompression, JsonFormat};
use crate::sql::schema::ConnectionType;
use crate::sql::schema::kafka_operator_config::build_kafka_proto_config;
use crate::sql::schema::table::SqlSource;
use crate::sql::types::ProcessingMode;

// NOTE(review): several generic parameters in this file were lost in
// extraction; they are reconstructed where unambiguous and flagged where not.

/// Connector-backed catalog table (adapter / source-sink model).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SourceTable {
    // NOTE(review): `registry_id`'s Option payload type was lost in extraction.
    pub registry_id: Option,
    pub adapter_type: String,
    pub table_identifier: String,
    pub role: TableRole,
    pub schema_specs: Vec<ColumnDescriptor>,
    /// Strongly-typed connector runtime configuration — replaces the legacy `opaque_config: String`.
    pub connector_config: ConnectorConfig,
    pub temporal_config: TemporalPipelineConfig,
    pub key_constraints: Vec<String>,
    pub payload_format: Option<DataEncodingFormat>,
    /// Wire [`Format`] when built from SQL `WITH` (updating mode, `ConnectionSchema`).
    pub connection_format: Option<Format>,
    pub description: String,
    pub partition_exprs: Arc<Option<Vec<Expr>>>,
    pub lookup_cache_max_bytes: Option<u64>,
    pub lookup_cache_ttl: Option<Duration>,
    pub inferred_fields: Option<Vec<FieldRef>>,
    /// Original `WITH` options for catalog persistence / `SHOW CREATE TABLE`.
    pub catalog_with_options: BTreeMap<String, String>,
}

// PartialOrd over the listed fields only (schema/config fields excluded).
multifield_partial_ord!(
    SourceTable,
    registry_id,
    adapter_type,
    table_identifier,
    role,
    description,
    key_constraints,
    connection_format,
    catalog_with_options
);

impl SourceTable {
    #[inline]
    pub fn name(&self) -> &str {
        self.table_identifier.as_str()
    }

    /// Minimal constructor; all optional state starts empty/None.
    pub fn new(
        table_identifier: impl Into<String>,
        connector: impl Into<String>,
        connection_type: ConnectionType,
    ) -> Self {
        Self {
            registry_id: None,
            adapter_type: connector.into(),
            table_identifier: table_identifier.into(),
            role: connection_type.into(),
            schema_specs: Vec::new(),
            connector_config: ConnectorConfig::Generic(HashMap::new()),
            temporal_config: TemporalPipelineConfig::default(),
            key_constraints: Vec::new(),
            payload_format: None,
            connection_format: None,
            description: String::new(),
            partition_exprs: Arc::new(None),
            lookup_cache_max_bytes: None,
            lookup_cache_ttl: None,
            inferred_fields: None,
            catalog_with_options: BTreeMap::new(),
        }
    }

    #[inline]
    pub fn connector(&self) -> &str {
        self.adapter_type.as_str()
    }

    #[inline]
    pub fn connection_type(&self) -> ConnectionType {
        self.role.into()
    }

    pub fn event_time_field(&self) -> Option<&str> {
        self.temporal_config.event_column.as_deref()
    }

    pub fn watermark_field(&self) -> Option<&str> {
        self.temporal_config.watermark_strategy_column.as_deref()
    }

    /// Watermark column name safe to persist for [`StreamTable::Source`]. Omits the computed
    /// [`sql_field::COMPUTED_WATERMARK`] column: stream catalog only stores Arrow physical fields,
    /// so `__watermark` cannot be resolved when the table is planned from the catalog.
    pub fn stream_catalog_watermark_field(&self) -> Option<String> {
        self.temporal_config
            .watermark_strategy_column
            .as_deref()
            .filter(|w| *w != sql_field::COMPUTED_WATERMARK)
            .map(str::to_string)
    }

    #[inline]
    pub fn catalog_with_options(&self) -> &BTreeMap<String, String> {
        &self.catalog_with_options
    }

    pub fn idle_time(&self) -> Option<Duration> {
        self.temporal_config.liveness_timeout
    }

    /// Build a table from catalog parameters (flat string option map).
    ///
    /// Applies adapter-specific column rules, wraps columns in a CDC envelope
    /// when the encoding requires it, and resolves temporal settings.
    ///
    /// # Errors
    /// Fails for unavailable adapters, envelope violations, and CDC sources
    /// without a primary key.
    pub fn initialize_from_params(
        id: &str,
        adapter: &str,
        raw_columns: Vec<ColumnDescriptor>,
        pk_list: Vec<String>,
        time_meta: Option<TemporalSpec>,
        options: &mut HashMap<String, String>,
        _schema_ctx: &dyn SchemaContext,
    ) -> Result<Self> {
        validate_adapter_availability(adapter)?;

        // Snapshot the options BEFORE any key is consumed, for persistence.
        let catalog_with_options: BTreeMap<String, String> = options
            .iter()
            .map(|(k, v)| (k.clone(), v.clone()))
            .collect();

        let encoding = DataEncodingFormat::extract_from_map(options)?;

        let mut refined_columns = apply_adapter_specific_rules(adapter, raw_columns);
        refined_columns = encoding.apply_envelope(refined_columns)?;

        let temporal_settings = resolve_temporal_logic(&refined_columns, time_meta)?;
        // Validation side effect only; the serialized config itself is unused here.
        let _finalized_config = serialize_backend_params(adapter, options)?;
        let role = deduce_role(options)?;

        if role == TableRole::Ingestion && encoding.supports_delta_updates() && pk_list.is_empty() {
            return plan_err!("CDC source requires at least one primary key");
        }

        Ok(Self {
            registry_id: None,
            adapter_type: adapter.to_string(),
            table_identifier: id.to_string(),
            role,
            schema_specs: refined_columns,
            connector_config: ConnectorConfig::Generic(
                catalog_with_options.clone().into_iter().collect(),
            ),
            temporal_config: temporal_settings,
            key_constraints: pk_list,
            payload_format: Some(encoding),
            connection_format: None,
            description: String::new(),
            partition_exprs: Arc::new(None),
            lookup_cache_max_bytes: None,
            lookup_cache_ttl: None,
            inferred_fields: None,
            catalog_with_options,
        })
    }

    /// Arrow schema of the physical (non-computed) columns only.
    pub fn produce_physical_schema(&self) -> Schema {
        Schema::new(
            self.schema_specs
                .iter()
                .filter(|c| !c.is_computed())
                .map(|c| c.arrow_field().clone())
                .collect::<Vec<_>>(),
        )
    }

    #[inline]
    pub fn physical_schema(&self) -> Schema {
        self.produce_physical_schema()
    }

    /// Convert to a runtime execution unit.
    ///
    /// # Errors
    /// Write-only (egress) tables cannot be read; CDC tables cannot carry
    /// computed columns.
    pub fn convert_to_execution_unit(&self) -> Result<TableExecutionUnit> {
        if self.role == TableRole::Egress {
            return plan_err!("Target [{}] is write-only", self.table_identifier);
        }

        if self.is_cdc_enabled() && self.schema_specs.iter().any(|c| c.is_computed()) {
            return plan_err!("CDC cannot be mixed with computed columns natively");
        }

        let mode = if self.is_cdc_enabled() {
            SyncMode::Incremental
        } else {
            SyncMode::AppendOnly
        };

        Ok(TableExecutionUnit {
            label: self.table_identifier.clone(),
            engine_meta: EngineDescriptor {
                engine_type: self.adapter_type.clone(),
                raw_payload: String::new(),
            },
            sync_mode: mode,
            temporal_offset: self.temporal_config.clone(),
        })
    }

    #[inline]
    pub fn to_execution_unit(&self) -> Result<TableExecutionUnit> {
        self.convert_to_execution_unit()
    }

    fn is_cdc_enabled(&self) -> bool {
        self.payload_format
            .as_ref()
            .is_some_and(|f| f.supports_delta_updates())
    }

    /// Build a table from SQL DDL `WITH` options.
    ///
    /// Order matters: each `pull_opt_*` consumes a key from `options`, and
    /// whatever remains at the end is forwarded verbatim to the connector as
    /// generic client configuration.
    #[allow(clippy::too_many_arguments)]
    pub fn from_options(
        table_identifier: &str,
        connector_name: &str,
        temporary: bool,
        fields: Vec<ColumnDescriptor>,
        primary_keys: Vec<String>,
        // (time column name, optional watermark expression)
        // NOTE(review): inner Option payload reconstructed as `ast::Expr` — confirm.
        watermark: Option<(String, Option<ast::Expr>)>,
        options: &mut ConnectorOptions,
        connection_profile: Option<&ConnectionProfile>,
        schema_provider: &StreamSchemaProvider,
        connection_type_override: Option<ConnectionType>,
        description: String,
    ) -> Result<Self> {
        // Accepted for signature compatibility; currently unused.
        let _ = connection_profile;

        // Snapshot before any key is consumed, for SHOW CREATE TABLE.
        let catalog_with_options = options.snapshot_for_catalog();

        // A redundant `connector` option must agree with the DDL connector.
        if let Some(c) = options.pull_opt_str(opt::CONNECTOR)?
            && c != connector_name
        {
            return plan_err!(
                "WITH option `connector` is '{c}' but table uses connector '{connector_name}'"
            );
        }

        validate_adapter_availability(connector_name)?;

        let mut columns = fields;
        columns = apply_adapter_specific_rules(connector_name, columns);

        let format = Format::from_opts(options)
            .map_err(|e| DataFusionError::Plan(format!("invalid format: '{e}'")))?;

        // Compressed JSON is a filesystem-only feature.
        if let Some(Format::Json(JsonFormat { compression, .. })) = &format
            && !matches!(compression, JsonCompression::Uncompressed)
            && connector_name != connector_type::FILESYSTEM
        {
            return plan_err!("'json.compression' is only supported for the filesystem connector");
        }

        // Framing is validated (and consumed) but unused on this path.
        let _framing = Framing::from_opts(options)
            .map_err(|e| DataFusionError::Plan(format!("invalid framing: '{e}'")))?;

        // TEMPORARY tables are forced to type 'lookup'; `insert_str` returns
        // any pre-existing value so a conflicting explicit type is rejected.
        if temporary
            && let Some(t) = options.insert_str(opt::TYPE, connection_table_role::LOOKUP)?
            && t != connection_table_role::LOOKUP
        {
            return plan_err!(
                "Cannot have a temporary table with type '{t}'; temporary tables must be type 'lookup'"
            );
        }

        let payload_format = format
            .as_ref()
            .map(DataEncodingFormat::from_connection_format);
        let encoding = payload_format.unwrap_or(DataEncodingFormat::Raw);
        columns = encoding.apply_envelope(columns)?;

        let bad_data = BadData::from_opts(options)
            .map_err(|e| DataFusionError::Plan(format!("Invalid bad_data: '{e}'")))?;

        // An explicit override (e.g. from DDL statement kind) wins over `type`.
        let role = if let Some(t) = connection_type_override {
            t.into()
        } else {
            match options.pull_opt_str(opt::TYPE)?.as_deref() {
                None | Some(connection_table_role::SOURCE) => TableRole::Ingestion,
                Some(connection_table_role::SINK) => TableRole::Egress,
                Some(connection_table_role::LOOKUP) => TableRole::Reference,
                Some(other) => {
                    return plan_err!("invalid connection type '{other}' in WITH options");
                }
            }
        };

        let mut table = SourceTable {
            registry_id: None,
            adapter_type: connector_name.to_string(),
            table_identifier: table_identifier.to_string(),
            role,
            schema_specs: columns,
            connector_config: ConnectorConfig::Generic(HashMap::new()),
            temporal_config: TemporalPipelineConfig::default(),
            key_constraints: Vec::new(),
            payload_format,
            connection_format: format.clone(),
            description,
            partition_exprs: Arc::new(None),
            lookup_cache_max_bytes: None,
            lookup_cache_ttl: None,
            inferred_fields: None,
            catalog_with_options,
        };

        // Deprecated spellings still honored, with a warning.
        if let Some(event_time_field) = options.pull_opt_field(opt::EVENT_TIME_FIELD)? {
            warn!("`event_time_field` WITH option is deprecated; use WATERMARK FOR syntax");
            table.temporal_config.event_column = Some(event_time_field);
        }

        if let Some(watermark_field) = options.pull_opt_field(opt::WATERMARK_FIELD)? {
            warn!("`watermark_field` WITH option is deprecated; use WATERMARK FOR syntax");
            table.temporal_config.watermark_strategy_column = Some(watermark_field);
        }

        // WATERMARK FOR <col> [AS <expr>] handling.
        if let Some((time_field, watermark_expr)) = watermark {
            let field = table
                .schema_specs
                .iter()
                .find(|c| c.arrow_field().name().as_str() == time_field.as_str())
                .ok_or_else(|| {
                    plan_datafusion_err!(
                        "WATERMARK FOR field `{}` does not exist in table",
                        time_field
                    )
                })?;

            // The event-time column must be a timezone-less TIMESTAMP.
            if !matches!(
                field.arrow_field().data_type(),
                DataType::Timestamp(_, None)
            ) {
                return plan_err!(
                    "WATERMARK FOR field `{time_field}` has type {}, but expected TIMESTAMP",
                    field.arrow_field().data_type()
                );
            }

            // Event-time columns are implicitly NOT NULL.
            for col in table.schema_specs.iter_mut() {
                if col.arrow_field().name().as_str() == time_field.as_str() {
                    col.set_nullable(false);
                    break;
                }
            }

            let table_ref = TableReference::bare(table.table_identifier.as_str());
            let df_schema =
                DFSchema::try_from_qualified_schema(table_ref, &table.produce_physical_schema())?;

            table.temporal_config.event_column = Some(time_field.clone());

            if let Some(expr) = watermark_expr {
                // Plan the SQL watermark expression against the physical schema.
                let logical_expr = plan_generating_expr(&expr, &df_schema, schema_provider)
                    .map_err(|e| {
                        DataFusionError::Plan(format!("could not plan watermark expression: {e}"))
                    })?;

                let (data_type, _nullable) = logical_expr.data_type_and_nullable(&df_schema)?;
                if !matches!(data_type, DataType::Timestamp(_, _)) {
                    return plan_err!(
                        "the type of the WATERMARK FOR expression must be TIMESTAMP, but was {data_type}"
                    );
                }

                // Store the expression as a computed `__watermark` column.
                table.schema_specs.push(ColumnDescriptor::new_computed(
                    Field::new(
                        sql_field::COMPUTED_WATERMARK,
                        logical_expr.get_type(&df_schema)?,
                        false,
                    ),
                    logical_expr,
                ));
                table.temporal_config.watermark_strategy_column =
                    Some(sql_field::COMPUTED_WATERMARK.to_string());
            } else {
                // No expression: the time column itself is the watermark.
                table.temporal_config.watermark_strategy_column = Some(time_field);
            }
        }

        // `idle_micros` (legacy, must be > 0) takes precedence over `idle_time`.
        let idle_from_micros = options
            .pull_opt_i64(opt::IDLE_MICROS)?
            .filter(|t| *t > 0)
            .map(|t| Duration::from_micros(t as u64));
        let idle_from_duration = options.pull_opt_duration(opt::IDLE_TIME)?;
        table.temporal_config.liveness_timeout = idle_from_micros.or(idle_from_duration);

        table.lookup_cache_max_bytes = options.pull_opt_u64(opt::LOOKUP_CACHE_MAX_BYTES)?;

        table.lookup_cache_ttl = options.pull_opt_duration(opt::LOOKUP_CACHE_TTL)?;

        // Kafka gets a strongly-typed proto config; everything else keeps the
        // residual options as a generic string map.
        if connector_name.eq_ignore_ascii_case(connector_type::KAFKA) {
            let proto_cfg = build_kafka_proto_config(options, role, &format, bad_data)?;
            table.connector_config = match proto_cfg {
                protocol::function_stream_graph::connector_op::Config::KafkaSource(cfg) => {
                    ConnectorConfig::KafkaSource(cfg)
                }
                protocol::function_stream_graph::connector_op::Config::KafkaSink(cfg) => {
                    ConnectorConfig::KafkaSink(cfg)
                }
                protocol::function_stream_graph::connector_op::Config::Generic(g) => {
                    ConnectorConfig::Generic(g.properties)
                }
            };
        } else {
            let extra_opts = options.drain_remaining_string_values()?;
            table.connector_config = ConnectorConfig::Generic(extra_opts);
        }

        if role == TableRole::Ingestion
            && encoding.supports_delta_updates()
            && primary_keys.is_empty()
        {
            return plan_err!("Debezium source must have at least one PRIMARY KEY field");
        }

        table.key_constraints = primary_keys;

        Ok(table)
    }

    pub fn has_virtual_fields(&self) -> bool {
        self.schema_specs.iter().any(|c| c.is_computed())
    }

    /// True when the table emits updates/retractions rather than appends.
    pub fn is_updating(&self) -> bool {
        self.connection_format
            .as_ref()
            .is_some_and(|f| f.is_updating())
            || self.payload_format == Some(DataEncodingFormat::DebeziumJson)
    }

    /// Build the wire `ConnectorOp` from the physical schema.
    pub fn connector_op(&self) -> ConnectorOp {
        let physical = self.produce_physical_schema();
        let fields: Vec<Field> = physical
            .fields()
            .iter()
            .map(|f| f.as_ref().clone())
            .collect();
        let fs_schema = FsSchema::from_fields(fields);

        ConnectorOp {
            connector: self.adapter_type.clone(),
            fs_schema: Some(fs_schema.into()),
            name:
            // NOTE(review): source is truncated at this point in the reviewed
            // chunk — the remainder of `connector_op` is not visible here.
self.table_identifier.clone(), + description: self.description.clone(), + config: Some(self.connector_config.to_proto_config()), + } + } + + pub fn processing_mode(&self) -> ProcessingMode { + if self.is_updating() { + ProcessingMode::Update + } else { + ProcessingMode::Append + } + } + + pub fn timestamp_override(&self) -> Result> { + if let Some(field_name) = self.temporal_config.event_column.clone() { + if self.is_updating() { + return plan_err!("can't use event_time_field with update mode"); + } + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + fn get_time_column(&self, field_name: &str) -> Result<&ColumnDescriptor> { + self.schema_specs + .iter() + .find(|c| { + c.arrow_field().name() == field_name + && matches!(c.arrow_field().data_type(), DataType::Timestamp(..)) + }) + .ok_or_else(|| { + DataFusionError::Plan(format!("field {field_name} not found or not a timestamp")) + }) + } + + pub fn watermark_column(&self) -> Result> { + if let Some(field_name) = self.temporal_config.watermark_strategy_column.clone() { + let _field = self.get_time_column(&field_name)?; + Ok(Some(Expr::Column(Column::from_name(field_name.as_str())))) + } else { + Ok(None) + } + } + + pub fn as_sql_source(&self) -> Result { + match self.role { + TableRole::Ingestion => {} + TableRole::Egress | TableRole::Reference => { + return plan_err!("cannot read from sink"); + } + }; + + if self.is_updating() && self.has_virtual_fields() { + return plan_err!("can't read from a source with virtual fields and update mode."); + } + + let timestamp_override = self.timestamp_override()?; + let watermark_column = self.watermark_column()?; + + let source = SqlSource { + id: self.registry_id, + struct_def: self + .schema_specs + .iter() + .filter(|c| !c.is_computed()) + .map(|c| Arc::new(c.arrow_field().clone())) + .collect(), + config: self.connector_op(), + processing_mode: self.processing_mode(), + 
idle_time: self.temporal_config.liveness_timeout, + }; + + Ok(SourceOperator { + name: self.table_identifier.clone(), + source, + timestamp_override, + watermark_column, + }) + } +} + +/// Plan a SQL scalar expression against a table-qualified schema (e.g. watermark `AS` clause). +fn plan_generating_expr( + ast: &ast::Expr, + df_schema: &DFSchema, + schema_provider: &StreamSchemaProvider, +) -> Result { + let planner = SqlToRel::new(schema_provider); + let mut ctx = PlannerContext::new(); + planner.sql_to_expr(ast.clone(), df_schema, &mut ctx) +} + +#[derive(Debug, Clone)] +pub struct SourceOperator { + pub name: String, + pub source: SqlSource, + pub timestamp_override: Option, + pub watermark_column: Option, +} diff --git a/src/sql/schema/table.rs b/src/sql/schema/table.rs new file mode 100644 index 00000000..efa0c59f --- /dev/null +++ b/src/sql/schema/table.rs @@ -0,0 +1,162 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use super::source_table::SourceTable; +use crate::sql::analysis::rewrite_plan; +use crate::sql::logical_node::remote_table::RemoteTableBoundaryNode; +use crate::sql::logical_planner::optimizers::produce_optimized_plan; +use crate::sql::schema::StreamSchemaProvider; +use crate::sql::types::{ProcessingMode, QualifiedField}; +use datafusion::arrow::datatypes::FieldRef; +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::{Extension, LogicalPlan}; +use datafusion::sql::sqlparser::ast::Statement; +use protocol::function_stream_graph::ConnectorOp; +use std::sync::Arc; +use std::time::Duration; + +/// Represents all table types in the FunctionStream SQL catalog. +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Table { + /// A lookup table backed by an external connector. + LookupTable(SourceTable), + /// A source/sink table backed by an external connector. + ConnectorTable(SourceTable), + /// A table defined by a query (CREATE VIEW / CREATE TABLE AS SELECT). + TableFromQuery { + name: String, + logical_plan: LogicalPlan, + }, +} + +impl Table { + /// Try to construct a Table from a CREATE TABLE or CREATE VIEW statement. + pub fn try_from_statement( + statement: &Statement, + schema_provider: &StreamSchemaProvider, + ) -> Result> { + use datafusion::logical_expr::{CreateMemoryTable, CreateView, DdlStatement}; + use datafusion::sql::sqlparser::ast::CreateTable; + + if let Statement::CreateTable(CreateTable { query: None, .. }) = statement { + return plan_err!( + "CREATE TABLE without AS SELECT is not supported; use CREATE TABLE ... AS SELECT or a connector table" + ); + } + + match produce_optimized_plan(statement, schema_provider) { + Ok(LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { name, input, .. }))) + | Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable { + name, + input, + .. 
+ }))) => { + let rewritten = rewrite_plan(input.as_ref().clone(), schema_provider)?; + let schema = rewritten.schema().clone(); + let remote = RemoteTableBoundaryNode { + upstream_plan: rewritten, + table_identifier: name.to_owned(), + resolved_schema: schema, + requires_materialization: true, + }; + Ok(Some(Table::TableFromQuery { + name: name.to_string(), + logical_plan: LogicalPlan::Extension(Extension { + node: Arc::new(remote), + }), + })) + } + _ => Ok(None), + } + } + + pub fn name(&self) -> &str { + match self { + Table::TableFromQuery { name, .. } => name.as_str(), + Table::ConnectorTable(c) | Table::LookupTable(c) => c.name(), + } + } + + pub fn get_fields(&self) -> Vec { + match self { + Table::ConnectorTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) + | Table::LookupTable(SourceTable { + schema_specs, + inferred_fields, + .. + }) => inferred_fields.clone().unwrap_or_else(|| { + schema_specs + .iter() + .map(|c| Arc::new(c.arrow_field().clone())) + .collect() + }), + Table::TableFromQuery { logical_plan, .. } => { + logical_plan.schema().fields().iter().cloned().collect() + } + } + } + + pub fn set_inferred_fields(&mut self, fields: Vec) -> Result<()> { + let Table::ConnectorTable(t) = self else { + return Ok(()); + }; + + if !t.schema_specs.is_empty() { + return Ok(()); + } + + if let Some(existing) = &t.inferred_fields { + let matches = existing.len() == fields.len() + && existing + .iter() + .zip(&fields) + .all(|(a, b)| a.name() == b.name() && a.data_type() == b.data_type()); + + if !matches { + return plan_err!("all inserts into a table must share the same schema"); + } + } + + let fields: Vec<_> = fields.into_iter().map(|f| f.field().clone()).collect(); + t.inferred_fields.replace(fields); + + Ok(()) + } + + pub fn connector_op(&self) -> Result { + match self { + Table::ConnectorTable(c) | Table::LookupTable(c) => Ok(c.connector_op()), + Table::TableFromQuery { .. 
} => plan_err!("can't write to a query-defined table"), + } + } + + pub fn partition_exprs(&self) -> Option<&Vec> { + match self { + Table::ConnectorTable(c) => (*c.partition_exprs).as_ref(), + _ => None, + } + } +} + +#[derive(Clone, Debug)] +pub struct SqlSource { + pub id: Option, + pub struct_def: Vec, + pub config: ConnectorOp, + pub processing_mode: ProcessingMode, + pub idle_time: Option, +} diff --git a/src/sql/schema/table_execution_unit.rs b/src/sql/schema/table_execution_unit.rs new file mode 100644 index 00000000..c23dda7a --- /dev/null +++ b/src/sql/schema/table_execution_unit.rs @@ -0,0 +1,33 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::temporal_pipeline_config::TemporalPipelineConfig; + +#[derive(Debug, Clone)] +pub struct EngineDescriptor { + pub engine_type: String, + pub raw_payload: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SyncMode { + AppendOnly, + Incremental, +} + +#[derive(Debug, Clone)] +pub struct TableExecutionUnit { + pub label: String, + pub engine_meta: EngineDescriptor, + pub sync_mode: SyncMode, + pub temporal_offset: TemporalPipelineConfig, +} diff --git a/src/sql/schema/table_role.rs b/src/sql/schema/table_role.rs new file mode 100644 index 00000000..7d301f9d --- /dev/null +++ b/src/sql/schema/table_role.rs @@ -0,0 +1,104 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion::common::{Result, plan_err}; +use datafusion::error::DataFusionError; + +use super::column_descriptor::ColumnDescriptor; +use super::connection_type::ConnectionType; +use crate::sql::common::constants::{ + SUPPORTED_CONNECTOR_ADAPTERS, connection_table_role, connector_type, +}; +use crate::sql::common::with_option_keys as opt; + +/// Role of a connector-backed table in the pipeline (ingest / egress / lookup). +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum TableRole { + Ingestion, + Egress, + Reference, +} + +impl From for ConnectionType { + fn from(r: TableRole) -> Self { + match r { + TableRole::Ingestion => ConnectionType::Source, + TableRole::Egress => ConnectionType::Sink, + TableRole::Reference => ConnectionType::Lookup, + } + } +} + +impl From for TableRole { + fn from(c: ConnectionType) -> Self { + match c { + ConnectionType::Source => TableRole::Ingestion, + ConnectionType::Sink => TableRole::Egress, + ConnectionType::Lookup => TableRole::Reference, + } + } +} + +pub fn validate_adapter_availability(adapter: &str) -> Result<()> { + if !SUPPORTED_CONNECTOR_ADAPTERS.contains(&adapter) { + return Err(DataFusionError::Plan(format!( + "Unknown adapter '{adapter}'" + ))); + } + Ok(()) +} + +pub fn apply_adapter_specific_rules( + adapter: &str, + mut cols: Vec, +) -> Vec { + match adapter { + a if a == connector_type::DELTA || a == connector_type::ICEBERG => { + for c in &mut cols { + if 
matches!(c.data_type(), DataType::Timestamp(_, _)) { + c.force_precision(TimeUnit::Microsecond); + } + } + cols + } + _ => cols, + } +} + +pub fn deduce_role(options: &HashMap) -> Result { + match options.get(opt::TYPE).map(|s| s.as_str()) { + None | Some(connection_table_role::SOURCE) => Ok(TableRole::Ingestion), + Some(connection_table_role::SINK) => Ok(TableRole::Egress), + Some(connection_table_role::LOOKUP) => Ok(TableRole::Reference), + Some(other) => plan_err!("Invalid role '{other}'"), + } +} + +pub fn serialize_backend_params( + adapter: &str, + options: &HashMap, +) -> Result { + let mut payload = serde_json::Map::new(); + payload.insert( + opt::ADAPTER.to_string(), + serde_json::Value::String(adapter.to_string()), + ); + + for (k, v) in options { + payload.insert(k.clone(), serde_json::Value::String(v.clone())); + } + + serde_json::to_string(&payload).map_err(|e| DataFusionError::Plan(e.to_string())) +} diff --git a/src/sql/schema/temporal_pipeline_config.rs b/src/sql/schema/temporal_pipeline_config.rs new file mode 100644 index 00000000..f672e552 --- /dev/null +++ b/src/sql/schema/temporal_pipeline_config.rs @@ -0,0 +1,58 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use datafusion::common::{Result, plan_err}; +use datafusion::logical_expr::Expr; + +use super::column_descriptor::ColumnDescriptor; +use crate::sql::common::constants::sql_field; + +/// Event-time and watermark configuration for streaming tables. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct TemporalPipelineConfig { + pub event_column: Option, + pub watermark_strategy_column: Option, + pub liveness_timeout: Option, +} + +#[derive(Debug, Clone)] +pub struct TemporalSpec { + pub time_field: String, + pub watermark_expr: Option, +} + +pub fn resolve_temporal_logic( + columns: &[ColumnDescriptor], + time_meta: Option, +) -> Result { + let mut config = TemporalPipelineConfig::default(); + + if let Some(meta) = time_meta { + let field_exists = columns + .iter() + .any(|c| c.arrow_field().name() == meta.time_field.as_str()); + if !field_exists { + return plan_err!("Temporal field {} does not exist", meta.time_field); + } + config.event_column = Some(meta.time_field.clone()); + + if meta.watermark_expr.is_some() { + config.watermark_strategy_column = Some(sql_field::COMPUTED_WATERMARK.to_string()); + } else { + config.watermark_strategy_column = Some(meta.time_field); + } + } + + Ok(config) +} diff --git a/src/sql/schema/utils.rs b/src/sql/schema/utils.rs new file mode 100644 index 00000000..45254e5f --- /dev/null +++ b/src/sql/schema/utils.rs @@ -0,0 +1,79 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use datafusion::common::{DFSchema, DFSchemaRef, Result as DFResult, TableReference}; + +use crate::sql::common::constants::window_interval_field; +use crate::sql::types::{QualifiedField, TIMESTAMP_FIELD}; + +/// Returns the Arrow struct type for a window (start, end) pair. +pub fn window_arrow_struct() -> DataType { + DataType::Struct( + vec![ + Arc::new(Field::new( + window_interval_field::START, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + Arc::new(Field::new( + window_interval_field::END, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + ] + .into(), + ) +} + +/// Adds a `_timestamp` field to a DFSchema if it doesn't already have one. +pub fn add_timestamp_field( + schema: DFSchemaRef, + qualifier: Option, +) -> DFResult { + if has_timestamp_field(&schema) { + return Ok(schema); + } + + let timestamp_field = QualifiedField::new( + qualifier, + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ); + Ok(Arc::new(schema.join(&DFSchema::new_with_metadata( + vec![timestamp_field.into()], + HashMap::new(), + )?)?)) +} + +/// Checks whether a DFSchema contains a `_timestamp` field. +pub fn has_timestamp_field(schema: &DFSchemaRef) -> bool { + schema + .fields() + .iter() + .any(|field| field.name() == TIMESTAMP_FIELD) +} + +/// Adds a `_timestamp` field to an Arrow Schema, returning a new SchemaRef. 
+pub fn add_timestamp_field_arrow(schema: Schema) -> SchemaRef { + let mut fields = schema.fields().to_vec(); + fields.push(Arc::new(Field::new( + TIMESTAMP_FIELD, + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ))); + Arc::new(Schema::new(fields)) +} diff --git a/src/sql/types/data_type.rs b/src/sql/types/data_type.rs new file mode 100644 index 00000000..070324d5 --- /dev/null +++ b/src/sql/types/data_type.rs @@ -0,0 +1,161 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use datafusion::arrow::datatypes::{ + DECIMAL_DEFAULT_SCALE, DECIMAL128_MAX_PRECISION, DataType, Field, IntervalUnit, TimeUnit, +}; +use datafusion::common::{Result, plan_datafusion_err, plan_err}; + +use crate::sql::common::FsExtensionType; +use crate::sql::common::constants::planning_placeholder_udf; + +pub fn convert_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::ArrayElemTypeDef; + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + + match sql_type { + SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) + | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_sql_type, _)) => { + let (data_type, extension) = convert_simple_data_type(inner_sql_type)?; + + Ok(( + DataType::List(Arc::new(FsExtensionType::add_metadata( + extension, + Field::new( + planning_placeholder_udf::LIST_ELEMENT_FIELD, + data_type, + true, + ), + ))), + None, + )) + } + SQLDataType::Array(ArrayElemTypeDef::None) => { + plan_err!("Arrays with unspecified type is not supported") + } + other => convert_simple_data_type(other), + } +} + +fn convert_simple_data_type( + sql_type: &datafusion::sql::sqlparser::ast::DataType, +) -> Result<(DataType, Option)> { + use datafusion::sql::sqlparser::ast::DataType as SQLDataType; + use datafusion::sql::sqlparser::ast::{ExactNumberInfo, TimezoneInfo}; + + if matches!(sql_type, SQLDataType::JSON) { + return Ok((DataType::Utf8, Some(FsExtensionType::JSON))); + } + + let dt = match sql_type { + SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean), + SQLDataType::TinyInt(_) => Ok(DataType::Int8), + SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), + SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32), + SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), + SQLDataType::TinyIntUnsigned(_) => Ok(DataType::UInt8), + 
SQLDataType::SmallIntUnsigned(_) | SQLDataType::Int2Unsigned(_) => Ok(DataType::UInt16), + SQLDataType::IntUnsigned(_) + | SQLDataType::UnsignedInteger + | SQLDataType::Int4Unsigned(_) => Ok(DataType::UInt32), + SQLDataType::BigIntUnsigned(_) | SQLDataType::Int8Unsigned(_) => Ok(DataType::UInt64), + SQLDataType::Float(_) => Ok(DataType::Float32), + SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32), + SQLDataType::Double(_) | SQLDataType::DoublePrecision | SQLDataType::Float8 => { + Ok(DataType::Float64) + } + SQLDataType::Char(_) + | SQLDataType::Varchar(_) + | SQLDataType::Text + | SQLDataType::String(_) => Ok(DataType::Utf8), + SQLDataType::Timestamp(None, TimezoneInfo::None) | SQLDataType::Datetime(_) => { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + SQLDataType::Timestamp(Some(precision), TimezoneInfo::None) => match *precision { + 0 => Ok(DataType::Timestamp(TimeUnit::Second, None)), + 3 => Ok(DataType::Timestamp(TimeUnit::Millisecond, None)), + 6 => Ok(DataType::Timestamp(TimeUnit::Microsecond, None)), + 9 => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), + _ => { + return plan_err!( + "unsupported precision {} -- supported precisions are 0 (seconds), \ + 3 (milliseconds), 6 (microseconds), and 9 (nanoseconds)", + precision + ); + } + }, + SQLDataType::Date => Ok(DataType::Date32), + SQLDataType::Time(None, tz_info) => { + if matches!(tz_info, TimezoneInfo::None) + || matches!(tz_info, TimezoneInfo::WithoutTimeZone) + { + Ok(DataType::Time64(TimeUnit::Nanosecond)) + } else { + return plan_err!("Unsupported SQL type {sql_type:?}"); + } + } + SQLDataType::Numeric(exact_number_info) | SQLDataType::Decimal(exact_number_info) => { + let (precision, scale) = match *exact_number_info { + ExactNumberInfo::None => (None, None), + ExactNumberInfo::Precision(precision) => (Some(precision), None), + ExactNumberInfo::PrecisionAndScale(precision, scale) => { + (Some(precision), Some(scale)) + } + }; + make_decimal_type(precision, scale) 
+ } + SQLDataType::Bytea => Ok(DataType::Binary), + SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), + SQLDataType::Struct(fields, _) => { + let fields: Vec<_> = fields + .iter() + .map(|f| { + Ok::<_, datafusion::error::DataFusionError>(Arc::new(Field::new( + f.field_name + .as_ref() + .ok_or_else(|| { + plan_datafusion_err!("anonymous struct fields are not allowed") + })? + .to_string(), + convert_data_type(&f.field_type)?.0, + true, + ))) + }) + .collect::>()?; + Ok(DataType::Struct(fields.into())) + } + _ => return plan_err!("Unsupported SQL type {sql_type:?}"), + }; + + Ok((dt?, None)) +} + +fn make_decimal_type(precision: Option, scale: Option) -> Result { + let (precision, scale) = match (precision, scale) { + (Some(p), Some(s)) => (p as u8, s as i8), + (Some(p), None) => (p as u8, 0), + (None, Some(_)) => return plan_err!("Cannot specify only scale for decimal data type"), + (None, None) => (DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + }; + + if precision == 0 || precision > DECIMAL128_MAX_PRECISION || scale.unsigned_abs() > precision { + plan_err!( + "Decimal(precision = {precision}, scale = {scale}) should satisfy `0 < precision <= 38`, and `scale <= precision`." + ) + } else { + Ok(DataType::Decimal128(precision, scale)) + } +} diff --git a/src/sql/types/df_field.rs b/src/sql/types/df_field.rs new file mode 100644 index 00000000..a32d7bc8 --- /dev/null +++ b/src/sql/types/df_field.rs @@ -0,0 +1,181 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion::common::{Column, DFSchema, Result, TableReference}; + +// ============================================================================ +// QualifiedField (Strongly-typed Field Wrapper) +// ============================================================================ + +/// Arrow [`Field`] plus optional SQL [`TableReference`] qualifier. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct QualifiedField { + qualifier: Option, + field: FieldRef, +} + +// ============================================================================ +// Type Conversions (Interoperability with DataFusion) +// ============================================================================ + +impl From<(Option, FieldRef)> for QualifiedField { + fn from((qualifier, field): (Option, FieldRef)) -> Self { + Self { qualifier, field } + } +} + +impl From<(Option<&TableReference>, &Field)> for QualifiedField { + fn from((qualifier, field): (Option<&TableReference>, &Field)) -> Self { + Self { + qualifier: qualifier.cloned(), + field: Arc::new(field.clone()), + } + } +} + +impl From for (Option, FieldRef) { + fn from(value: QualifiedField) -> Self { + (value.qualifier, value.field) + } +} + +// ============================================================================ +// Core API +// ============================================================================ + +impl QualifiedField { + pub fn new( + qualifier: Option, + name: impl Into, + data_type: DataType, + nullable: bool, + ) -> Self { + Self { + qualifier, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } + + pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { + Self { + qualifier: None, + field: Arc::new(Field::new(name, data_type, nullable)), + } + } 
+ + #[inline] + pub fn name(&self) -> &str { + self.field.name() + } + + #[inline] + pub fn data_type(&self) -> &DataType { + self.field.data_type() + } + + #[inline] + pub fn is_nullable(&self) -> bool { + self.field.is_nullable() + } + + #[inline] + pub fn metadata(&self) -> &HashMap { + self.field.metadata() + } + + #[inline] + pub fn qualifier(&self) -> Option<&TableReference> { + self.qualifier.as_ref() + } + + #[inline] + pub fn field(&self) -> &FieldRef { + &self.field + } + + pub fn qualified_name(&self) -> String { + match &self.qualifier { + Some(qualifier) => format!("{}.{}", qualifier, self.field.name()), + None => self.field.name().to_owned(), + } + } + + pub fn qualified_column(&self) -> Column { + Column { + relation: self.qualifier.clone(), + name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn unqualified_column(&self) -> Column { + Column { + relation: None, + name: self.field.name().to_string(), + spans: Default::default(), + } + } + + pub fn strip_qualifier(mut self) -> Self { + self.qualifier = None; + self + } + + pub fn with_nullable(mut self, nullable: bool) -> Self { + if self.field.is_nullable() == nullable { + return self; + } + let field = Arc::try_unwrap(self.field).unwrap_or_else(|arc| (*arc).clone()); + self.field = Arc::new(field.with_nullable(nullable)); + self + } + + pub fn with_metadata(mut self, metadata: HashMap) -> Self { + let field = Arc::try_unwrap(self.field).unwrap_or_else(|arc| (*arc).clone()); + self.field = Arc::new(field.with_metadata(metadata)); + self + } +} + +// ============================================================================ +// Schema Collection Helpers +// ============================================================================ + +pub fn extract_qualified_fields(schema: &DFSchema) -> Vec { + schema + .fields() + .iter() + .enumerate() + .map(|(i, field)| { + let (qualifier, _) = schema.qualified_field(i); + QualifiedField { + qualifier: qualifier.cloned(), + 
field: field.clone(), + } + }) + .collect() +} + +pub fn build_df_schema(fields: &[QualifiedField]) -> Result { + build_df_schema_with_metadata(fields, HashMap::new()) +} + +pub fn build_df_schema_with_metadata( + fields: &[QualifiedField], + metadata: HashMap, +) -> Result { + DFSchema::new_with_metadata(fields.iter().map(|f| f.clone().into()).collect(), metadata) +} diff --git a/src/sql/types/mod.rs b/src/sql/types/mod.rs new file mode 100644 index 00000000..c9d80681 --- /dev/null +++ b/src/sql/types/mod.rs @@ -0,0 +1,62 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +mod data_type; +mod df_field; +pub(crate) mod placeholder_udf; +mod stream_schema; +mod window; + +use std::time::Duration; + +use crate::sql::common::constants::sql_planning_default; + +pub use df_field::{ + QualifiedField, build_df_schema, build_df_schema_with_metadata, extract_qualified_fields, +}; +pub(crate) use placeholder_udf::PlanningPlaceholderUdf; +pub(crate) use window::WindowBehavior; +pub use window::{WindowType, extract_window_type}; + +pub use crate::sql::common::constants::sql_field::TIMESTAMP_FIELD; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ProcessingMode { + Append, + Update, +} + +#[derive(Clone, Debug)] +pub struct SqlConfig { + pub default_parallelism: usize, +} + +impl Default for SqlConfig { + fn default() -> Self { + Self { + default_parallelism: sql_planning_default::DEFAULT_PARALLELISM, + } + } +} + +#[derive(Clone)] +pub struct PlanningOptions { + pub ttl: Duration, +} + +impl Default for PlanningOptions { + fn default() -> Self { + Self { + ttl: Duration::from_secs(sql_planning_default::PLANNING_TTL_SECS), + } + } +} diff --git a/src/sql/types/placeholder_udf.rs b/src/sql/types/placeholder_udf.rs new file mode 100644 index 00000000..059637e9 --- /dev/null +++ b/src/sql/types/placeholder_udf.rs @@ -0,0 +1,79 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use datafusion::common::{Result, internal_err}; +use datafusion::logical_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, +}; + +// ============================================================================ +// PlanningPlaceholderUdf +// ============================================================================ + +/// Logical-planning-only UDF: satisfies type checking until real functions are wired in. +pub(crate) struct PlanningPlaceholderUdf { + name: String, + signature: Signature, + return_type: DataType, +} + +impl Debug for PlanningPlaceholderUdf { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "PlanningPlaceholderUDF<{}>", self.name) + } +} + +impl ScalarUDFImpl for PlanningPlaceholderUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(self.return_type.clone()) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + internal_err!( + "PlanningPlaceholderUDF '{}' was invoked during physical execution. 
\ + This indicates a bug in the stream query compiler: placeholders must be \ + swapped with actual physical UDFs before execution begins.", + self.name + ) + } +} + +impl PlanningPlaceholderUdf { + pub fn new_with_return( + name: impl Into, + args: Vec, + return_type: DataType, + ) -> Arc { + Arc::new(ScalarUDF::new_from_impl(Self { + name: name.into(), + signature: Signature::exact(args, Volatility::Volatile), + return_type, + })) + } +} diff --git a/src/sql/types/stream_schema.rs b/src/sql/types/stream_schema.rs new file mode 100644 index 00000000..c973386e --- /dev/null +++ b/src/sql/types/stream_schema.rs @@ -0,0 +1,133 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::{Field, Schema, SchemaRef}; +use datafusion::common::{DataFusionError, Result}; + +use super::TIMESTAMP_FIELD; + +// ============================================================================ +// StreamSchema +// ============================================================================ + +/// Schema wrapper for continuous streaming: requires event-time (`TIMESTAMP_FIELD`) for watermarks +/// and optionally tracks key column indices for partitioned state / shuffle. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StreamSchema { + schema: SchemaRef, + timestamp_index: usize, + key_indices: Option>, +} + +impl StreamSchema { + // ======================================================================== + // Raw Constructors (When indices are strictly known in advance) + // ======================================================================== + + /// Keyed stream when indices are already verified. + pub fn new_keyed(schema: SchemaRef, timestamp_index: usize, key_indices: Vec) -> Self { + Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + } + } + + /// Unkeyed stream when `timestamp_index` is already verified. + pub fn new_unkeyed(schema: SchemaRef, timestamp_index: usize) -> Self { + Self { + schema, + timestamp_index, + key_indices: None, + } + } + + // ======================================================================== + // Safe Builders (Dynamically resolves and validates indices) + // ======================================================================== + + /// Unkeyed stream from a field list. Replaces the old `unwrap_or(0)` default when the timestamp + /// column was missing (silent wrong index / corruption). + pub fn try_from_fields(fields: impl Into>) -> Result { + let schema = Arc::new(Schema::new(fields.into())); + Self::try_from_schema_unkeyed(schema) + } + + /// Keyed stream from `SchemaRef`; resolves and validates the mandatory timestamp column. + pub fn try_from_schema_keyed(schema: SchemaRef, key_indices: Vec) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Streaming Topology Error: Mandatory event-time field '{}' is missing in the schema. \ + Current schema fields: {:?}", + TIMESTAMP_FIELD, + schema.fields() + )) + })? 
+ .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: Some(key_indices), + }) + } + + /// Unkeyed stream from `SchemaRef`; resolves and validates the mandatory timestamp column. + pub fn try_from_schema_unkeyed(schema: SchemaRef) -> Result { + let timestamp_index = schema + .column_with_name(TIMESTAMP_FIELD) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Streaming Topology Error: Mandatory event-time field '{}' is missing.", + TIMESTAMP_FIELD + )) + })? + .0; + + Ok(Self { + schema, + timestamp_index, + key_indices: None, + }) + } + + // ======================================================================== + // Zero-cost Getters + // ======================================================================== + + /// Underlying Arrow schema. + #[inline] + pub fn arrow_schema(&self) -> &SchemaRef { + &self.schema + } + + /// Physical column index used as event time / watermark driver. + #[inline] + pub fn timestamp_index(&self) -> usize { + self.timestamp_index + } + + /// Key column indices for shuffle / state, if keyed. + #[inline] + pub fn key_indices(&self) -> Option<&[usize]> { + self.key_indices.as_deref() + } + + #[inline] + pub fn is_keyed(&self) -> bool { + self.key_indices.is_some() + } +} diff --git a/src/sql/types/window.rs b/src/sql/types/window.rs new file mode 100644 index 00000000..1aa05f42 --- /dev/null +++ b/src/sql/types/window.rs @@ -0,0 +1,134 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::time::Duration; + +use datafusion::common::{Result, ScalarValue, not_impl_err, plan_err}; +use datafusion::logical_expr::Expr; +use datafusion::logical_expr::expr::{Alias, ScalarFunction}; + +use crate::sql::common::constants::window_fn; + +use super::QualifiedField; + +// ============================================================================ +// Window Definitions +// ============================================================================ + +/// Temporal windowing semantics for streaming aggregations. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum WindowType { + Tumbling { width: Duration }, + Sliding { width: Duration, slide: Duration }, + Session { gap: Duration }, + Instant, +} + +/// How windowing is represented in the physical plan. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub(crate) enum WindowBehavior { + FromOperator { + window: WindowType, + window_field: QualifiedField, + window_index: usize, + is_nested: bool, + }, + InData, +} + +// ============================================================================ +// Logical Expression Parsers +// ============================================================================ + +pub fn extract_duration(expression: &Expr) -> Result { + match expression { + Expr::Literal(ScalarValue::IntervalDayTime(Some(val)), _) => { + let secs = (val.days as u64) * 24 * 60 * 60; + let millis = val.milliseconds as u64; + Ok(Duration::from_secs(secs) + Duration::from_millis(millis)) + } + Expr::Literal(ScalarValue::IntervalMonthDayNano(Some(val)), _) => { + if val.months != 0 { + return not_impl_err!( + "Streaming engine does not support window durations specified in months due to variable month lengths." + ); + } + let secs = (val.days as u64) * 24 * 60 * 60; + let nanos = val.nanoseconds as u64; + Ok(Duration::from_secs(secs) + Duration::from_nanos(nanos)) + } + _ => plan_err!( + "Unsupported window duration expression. 
Expected an interval literal (e.g., INTERVAL '1' MINUTE), got: {}", + expression + ), + } +} + +pub fn extract_window_type(expression: &Expr) -> Result> { + match expression { + Expr::ScalarFunction(ScalarFunction { func, args }) => match func.name() { + name if name == window_fn::HOP => { + if args.len() != 2 { + return plan_err!( + "hop() window function expects exactly 2 arguments (slide, width), got {}", + args.len() + ); + } + + let slide = extract_duration(&args[0])?; + let width = extract_duration(&args[1])?; + + if width.as_nanos() % slide.as_nanos() != 0 { + return plan_err!( + "Streaming Topology Error: hop() window width {:?} must be a perfect multiple of slide {:?}", + width, + slide + ); + } + + if slide == width { + Ok(Some(WindowType::Tumbling { width })) + } else { + Ok(Some(WindowType::Sliding { width, slide })) + } + } + + name if name == window_fn::TUMBLE => { + if args.len() != 1 { + return plan_err!( + "tumble() window function expects exactly 1 argument (width), got {}", + args.len() + ); + } + let width = extract_duration(&args[0])?; + Ok(Some(WindowType::Tumbling { width })) + } + + name if name == window_fn::SESSION => { + if args.len() != 1 { + return plan_err!( + "session() window function expects exactly 1 argument (gap), got {}", + args.len() + ); + } + let gap = extract_duration(&args[0])?; + Ok(Some(WindowType::Session { gap })) + } + + _ => Ok(None), + }, + + Expr::Alias(Alias { expr, .. }) => extract_window_type(expr), + + _ => Ok(None), + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index a4898619..823425d2 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -11,4 +11,5 @@ // limitations under the License. 
pub mod state_backend; +pub mod stream_catalog; pub mod task; diff --git a/src/storage/stream_catalog/codec.rs b/src/storage/stream_catalog/codec.rs new file mode 100644 index 00000000..34c2c4ba --- /dev/null +++ b/src/storage/stream_catalog/codec.rs @@ -0,0 +1,57 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Arrow Schema IPC and [`LogicalProgram`] bincode payloads for stream catalog rows. + +use std::io::Cursor; +use std::sync::Arc; + +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::ipc::reader::StreamReader; +use datafusion::arrow::ipc::writer::StreamWriter; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::{DataFusionError, Result}; + +use crate::sql::logical_node::logical::LogicalProgram; + +pub struct CatalogCodec; + +impl CatalogCodec { + pub fn encode_schema(schema: &Arc) -> Result> { + let mut buffer = Vec::new(); + let empty_batch = RecordBatch::new_empty(Arc::clone(schema)); + let mut writer = StreamWriter::try_new(&mut buffer, schema.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .write(&empty_batch) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + writer + .finish() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(buffer) + } + + pub fn decode_schema(bytes: &[u8]) -> Result> { + let cursor = Cursor::new(bytes); + let reader = StreamReader::try_new(cursor, None) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(reader.schema()) + 
} + + pub fn encode_logical_program(program: &LogicalProgram) -> Result> { + program.encode_for_catalog() + } + + pub fn decode_logical_program(bytes: &[u8]) -> Result { + LogicalProgram::decode_for_catalog(bytes) + } +} diff --git a/src/storage/stream_catalog/manager.rs b/src/storage/stream_catalog/manager.rs new file mode 100644 index 00000000..3804a95a --- /dev/null +++ b/src/storage/stream_catalog/manager.rs @@ -0,0 +1,640 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::{Arc, OnceLock}; + +use anyhow::{Context, anyhow, bail}; +use datafusion::common::{Result as DFResult, internal_err, plan_err}; +use prost::Message; +use protocol::function_stream_graph::FsProgram; +use protocol::storage::{self as pb, table_definition}; +use tracing::{info, warn}; +use unicase::UniCase; + +use crate::sql::common::constants::sql_field; +use crate::sql::schema::column_descriptor::ColumnDescriptor; +use crate::sql::schema::connection_type::ConnectionType; +use crate::sql::schema::source_table::SourceTable; +use crate::sql::schema::table::Table as CatalogTable; +use crate::sql::schema::{StreamPlanningContext, StreamTable}; + +use super::codec::CatalogCodec; +use super::meta_store::MetaStore; + +const CATALOG_KEY_PREFIX: &str = "catalog:stream_table:"; +const STREAMING_JOB_KEY_PREFIX: &str = "streaming_job:"; + +pub struct CatalogManager { + store: Arc, +} + +static GLOBAL_CATALOG: OnceLock> = OnceLock::new(); + +impl CatalogManager { + pub fn new(store: Arc) 
-> Self { + Self { store } + } + + pub fn init_global_in_memory() -> anyhow::Result<()> { + Self::init_global(Arc::new(super::InMemoryMetaStore::new())) + } + + pub fn init_global(store: Arc) -> anyhow::Result<()> { + if GLOBAL_CATALOG.get().is_some() { + bail!("CatalogManager already initialized"); + } + + let mgr = Arc::new(CatalogManager::new(store)); + GLOBAL_CATALOG + .set(mgr) + .map_err(|_| anyhow!("CatalogManager global install failed"))?; + + Ok(()) + } + + pub fn try_global() -> Option> { + GLOBAL_CATALOG.get().cloned() + } + + pub fn global() -> anyhow::Result> { + Self::try_global().ok_or_else(|| anyhow!("CatalogManager not initialized")) + } + + #[inline] + fn build_store_key(table_name: &str) -> String { + format!("{CATALOG_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + #[inline] + fn build_streaming_job_key(table_name: &str) -> String { + format!("{STREAMING_JOB_KEY_PREFIX}{}", table_name.to_lowercase()) + } + + // ======================================================================== + // Streaming job persistence (CREATE STREAMING TABLE / DROP STREAMING TABLE) + // ======================================================================== + + pub fn persist_streaming_job( + &self, + table_name: &str, + fs_program: &FsProgram, + comment: &str, + ) -> DFResult<()> { + let program_bytes = fs_program.encode_to_vec(); + let def = pb::StreamingTableDefinition { + table_name: table_name.to_string(), + created_at_millis: chrono::Utc::now().timestamp_millis(), + fs_program_bytes: program_bytes, + comment: comment.to_string(), + }; + let payload = def.encode_to_vec(); + let key = Self::build_streaming_job_key(table_name); + self.store.put(&key, payload)?; + info!(table = %table_name, "Streaming job definition persisted"); + Ok(()) + } + + pub fn remove_streaming_job(&self, table_name: &str) -> DFResult<()> { + let key = Self::build_streaming_job_key(table_name); + self.store.delete(&key)?; + info!(table = %table_name, "Streaming job definition removed 
from store"); + Ok(()) + } + + pub fn load_streaming_job_definitions(&self) -> DFResult> { + let records = self.store.scan_prefix(STREAMING_JOB_KEY_PREFIX)?; + let mut out = Vec::with_capacity(records.len()); + for (key, payload) in records { + let def = match pb::StreamingTableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + key = %key, + error = %e, + "Skipping corrupted streaming job record" + ); + continue; + } + }; + let program = match FsProgram::decode(def.fs_program_bytes.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + table = %def.table_name, + error = %e, + "Skipping streaming job with corrupted FsProgram" + ); + continue; + } + }; + out.push((def.table_name, program)); + } + Ok(out) + } + + // ======================================================================== + // Catalog table persistence (CREATE TABLE / DROP TABLE) + // ======================================================================== + + pub fn add_catalog_table(&self, table: CatalogTable) -> DFResult<()> { + let proto_def = self.encode_catalog_table(&table)?; + let payload = proto_def.encode_to_vec(); + let key = Self::build_store_key(table.name()); + + self.store.put(&key, payload)?; + Ok(()) + } + + pub fn has_catalog_table(&self, name: &str) -> bool { + let key = Self::build_store_key(name); + self.store.get(&key).ok().flatten().is_some() + } + + pub fn drop_catalog_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + let key = Self::build_store_key(table_name); + let exists = self.store.get(&key)?.is_some(); + if !exists { + if if_exists { + return Ok(()); + } + return plan_err!("Table '{table_name}' not found"); + } + self.store.delete(&key)?; + Ok(()) + } + + pub fn restore_from_store(&self) -> DFResult<()> { + // No-op by design: the catalog is read-through from storage. 
+ Ok(()) + } + + pub fn acquire_planning_context(&self) -> StreamPlanningContext { + let mut ctx = StreamPlanningContext::new(); + let catalogs = self.load_catalog_tables_map().unwrap_or_default(); + ctx.tables.catalogs = catalogs.clone(); + + for (name, table) in catalogs { + let source = match table.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => s, + CatalogTable::TableFromQuery { .. } => continue, + }; + + let schema = Arc::new(source.produce_physical_schema()); + ctx.tables.streams.insert( + name, + Arc::new(StreamTable::Source { + name: source.name().to_string(), + connector: source.connector().to_string(), + schema, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: source.catalog_with_options().clone(), + }), + ); + } + ctx + } + + /// All persisted catalog tables, sorted by table name. + pub fn list_catalog_tables(&self) -> DFResult>> { + let mut out: Vec> = + self.load_catalog_tables_map()?.into_values().collect(); + out.sort_by(|a, b| a.name().cmp(b.name())); + Ok(out) + } + + pub fn get_catalog_table(&self, name: &str) -> DFResult>> { + let key = UniCase::new(name.to_string()); + Ok(self.load_catalog_tables_map()?.get(&key).cloned()) + } + + pub fn add_table(&self, table: StreamTable) -> DFResult<()> { + match table { + StreamTable::Source { + name, + connector, + schema, + event_time_field, + watermark_field, + with_options, + } => { + let mut source = SourceTable::new(name, connector, ConnectionType::Source); + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + .collect(); + source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = event_time_field; + source.temporal_config.watermark_strategy_column = watermark_field; + source.catalog_with_options = with_options; + 
self.add_catalog_table(CatalogTable::ConnectorTable(source)) + } + StreamTable::Sink { name, .. } => plan_err!( + "Persisting streaming sink '{name}' in stream catalog is no longer supported" + ), + } + } + + pub fn has_stream_table(&self, name: &str) -> bool { + self.has_catalog_table(name) + } + + pub fn drop_table(&self, table_name: &str, if_exists: bool) -> DFResult<()> { + self.drop_catalog_table(table_name, if_exists) + } + + pub fn list_stream_tables(&self) -> Vec> { + self.list_catalog_tables() + .unwrap_or_default() + .into_iter() + .filter_map(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. } => None, + }) + .collect() + } + + pub fn get_stream_table(&self, name: &str) -> Option> { + self.get_catalog_table(name) + .ok() + .flatten() + .and_then(|t| match t.as_ref() { + CatalogTable::ConnectorTable(s) | CatalogTable::LookupTable(s) => { + Some(Arc::new(StreamTable::Source { + name: s.name().to_string(), + connector: s.connector().to_string(), + schema: Arc::new(s.produce_physical_schema()), + event_time_field: s.event_time_field().map(str::to_string), + watermark_field: s.stream_catalog_watermark_field(), + with_options: s.catalog_with_options().clone(), + })) + } + CatalogTable::TableFromQuery { .. 
} => None, + }) + } + + fn encode_catalog_table(&self, table: &CatalogTable) -> DFResult { + let table_type = match table { + CatalogTable::ConnectorTable(source) | CatalogTable::LookupTable(source) => { + let mut opts = source.catalog_with_options().clone(); + opts.entry("connector".to_string()) + .or_insert_with(|| source.connector().to_string()); + let catalog_row = pb::CatalogSourceTable { + arrow_schema_ipc: CatalogCodec::encode_schema(&Arc::new( + source.produce_physical_schema(), + ))?, + event_time_field: source.event_time_field().map(str::to_string), + watermark_field: source.stream_catalog_watermark_field(), + with_options: opts.into_iter().collect(), + connector: source.connector().to_string(), + description: source.description.clone(), + }; + if matches!(table, CatalogTable::LookupTable(_)) { + table_definition::TableType::LookupTable(catalog_row) + } else { + table_definition::TableType::ConnectorTable(catalog_row) + } + } + CatalogTable::TableFromQuery { name, .. } => { + return plan_err!( + "Persisting query-defined table '{}' is not supported by stream catalog storage", + name + ); + } + }; + + Ok(pb::TableDefinition { + table_name: table.name().to_string(), + updated_at_millis: chrono::Utc::now().timestamp_millis(), + table_type: Some(table_type), + }) + } + + fn decode_catalog_source_table( + &self, + table_name: String, + source_row: pb::CatalogSourceTable, + as_lookup: bool, + ) -> DFResult { + let connector = if source_row.connector.is_empty() { + source_row + .with_options + .get("connector") + .cloned() + .unwrap_or_else(|| "stream_catalog".to_string()) + } else { + source_row.connector.clone() + }; + let mut source = SourceTable::new( + table_name, + connector, + if as_lookup { + ConnectionType::Lookup + } else { + ConnectionType::Source + }, + ); + let schema = CatalogCodec::decode_schema(&source_row.arrow_schema_ipc)?; + source.schema_specs = schema + .fields() + .iter() + .map(|f| ColumnDescriptor::new_physical((**f).clone())) + 
.collect(); + source.inferred_fields = Some(schema.fields().iter().cloned().collect()); + source.temporal_config.event_column = source_row.event_time_field; + source.temporal_config.watermark_strategy_column = source_row + .watermark_field + .filter(|w| w != sql_field::COMPUTED_WATERMARK); + source.catalog_with_options = source_row.with_options.into_iter().collect(); + source.description = source_row.description; + + // Rebuild strongly-typed ConnectorConfig from persisted WITH options. + if source.connector().eq_ignore_ascii_case("kafka") { + use crate::sql::schema::ConnectorConfig; + use crate::sql::schema::kafka_operator_config::build_kafka_proto_config_from_string_map; + let opts_map: std::collections::HashMap = source + .catalog_with_options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + let physical = source.produce_physical_schema(); + if let Ok(proto_cfg) = build_kafka_proto_config_from_string_map(opts_map, &physical) { + source.connector_config = match proto_cfg { + protocol::function_stream_graph::connector_op::Config::KafkaSource(cfg) => { + ConnectorConfig::KafkaSource(cfg) + } + protocol::function_stream_graph::connector_op::Config::KafkaSink(cfg) => { + ConnectorConfig::KafkaSink(cfg) + } + protocol::function_stream_graph::connector_op::Config::Generic(g) => { + ConnectorConfig::Generic(g.properties) + } + }; + } + } else { + use crate::sql::schema::ConnectorConfig; + source.connector_config = ConnectorConfig::Generic( + source + .catalog_with_options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(), + ); + } + + if as_lookup { + Ok(CatalogTable::LookupTable(source)) + } else { + Ok(CatalogTable::ConnectorTable(source)) + } + } + + fn decode_catalog_table(&self, proto_def: pb::TableDefinition) -> DFResult { + let Some(table_type) = proto_def.table_type else { + return internal_err!( + "Corrupted catalog row: missing table_type for {}", + proto_def.table_name + ); + }; + + match table_type { + 
table_definition::TableType::ConnectorTable(src) => { + self.decode_catalog_source_table(proto_def.table_name, src, false) + } + table_definition::TableType::LookupTable(src) => { + self.decode_catalog_source_table(proto_def.table_name, src, true) + } + } + } + + fn load_catalog_tables_map( + &self, + ) -> DFResult>> + { + let mut out = std::collections::HashMap::new(); + let records = self.store.scan_prefix(CATALOG_KEY_PREFIX)?; + for (key, payload) in records { + let proto_def = match pb::TableDefinition::decode(payload.as_slice()) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping corrupted stream catalog row: protobuf decode failed" + ); + continue; + } + }; + let table = match self.decode_catalog_table(proto_def) { + Ok(v) => v, + Err(e) => { + warn!( + catalog_key = %key, + error = %e, + "Skipping unsupported/corrupted stream catalog row" + ); + continue; + } + }; + let object_name = UniCase::new(table.name().to_string()); + out.insert(object_name, Arc::new(table)); + } + Ok(out) + } +} + +pub fn restore_global_catalog_from_store() { + let Some(mgr) = CatalogManager::try_global() else { + return; + }; + match mgr.restore_from_store() { + Ok(()) => { + let n = mgr.list_catalog_tables().map(|t| t.len()).unwrap_or(0); + info!(catalog_tables = n, "Catalog loaded from durable store"); + } + Err(e) => warn!("Stream catalog restore_from_store failed: {e:#}"), + } +} + +pub fn restore_streaming_jobs_from_store() { + use crate::runtime::streaming::job::JobManager; + + let Some(catalog) = CatalogManager::try_global() else { + warn!("CatalogManager not available; skipping streaming job restore"); + return; + }; + let job_manager = match JobManager::global() { + Ok(jm) => jm, + Err(e) => { + warn!(error = %e, "JobManager not available; skipping streaming job restore"); + return; + } + }; + + let definitions = match catalog.load_streaming_job_definitions() { + Ok(defs) => defs, + Err(e) => { + warn!(error = %e, "Failed to load streaming 
job definitions from store"); + return; + } + }; + + if definitions.is_empty() { + info!("No persisted streaming jobs to restore"); + return; + } + + let total = definitions.len(); + info!(count = total, "Restoring persisted streaming jobs"); + + let rt = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(rt) => rt, + Err(e) => { + warn!( + error = %e, + "Failed to create Tokio runtime for streaming job restore" + ); + return; + } + }; + let mut restored = 0usize; + let mut failed = 0usize; + + for (table_name, fs_program) in definitions { + let jm = job_manager.clone(); + let name = table_name.clone(); + match rt.block_on(jm.submit_job(name.clone(), fs_program)) { + Ok(job_id) => { + info!(table = %table_name, job_id = %job_id, "Streaming job restored"); + restored += 1; + } + Err(e) => { + warn!(table = %table_name, error = %e, "Failed to restore streaming job"); + failed += 1; + } + } + } + + info!( + restored = restored, + failed = failed, + total = total, + "Streaming job restore complete" + ); +} + +pub fn initialize_stream_catalog(config: &crate::config::GlobalConfig) -> anyhow::Result<()> { + if !config.stream_catalog.persist { + return CatalogManager::init_global_in_memory() + .context("Stream catalog (CatalogManager) in-memory init failed"); + } + + let path = config + .stream_catalog + .db_path + .as_ref() + .map(|p| crate::config::resolve_path(p)) + .unwrap_or_else(|| crate::config::get_data_dir().join("stream_catalog")); + + std::fs::create_dir_all(&path).with_context(|| { + format!( + "Failed to create stream catalog directory {}", + path.display() + ) + })?; + + let store = std::sync::Arc::new(super::RocksDbMetaStore::open(&path).with_context(|| { + format!( + "Failed to open stream catalog RocksDB at {}", + path.display() + ) + })?); + + CatalogManager::init_global(store).context("Stream catalog (CatalogManager) init failed") +} + +#[allow(clippy::unwrap_or_default)] +pub fn planning_schema_provider() -> 
StreamPlanningContext { + CatalogManager::try_global() + .map(|m| m.acquire_planning_context()) + .unwrap_or_else(StreamPlanningContext::new) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field}; + + use crate::sql::schema::column_descriptor::ColumnDescriptor; + use crate::sql::schema::connection_type::ConnectionType; + use crate::sql::schema::source_table::SourceTable; + use crate::sql::schema::table::Table as CatalogTable; + use crate::storage::stream_catalog::InMemoryMetaStore; + + use super::CatalogManager; + + fn create_test_manager() -> CatalogManager { + CatalogManager::new(Arc::new(InMemoryMetaStore::new())) + } + + #[test] + fn add_table_roundtrip_snapshot() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t1", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + source.temporal_config.event_column = Some("ts".into()); + let table = CatalogTable::ConnectorTable(source); + + mgr.add_catalog_table(table).unwrap(); + + let got = mgr.get_catalog_table("t1").unwrap().expect("table present"); + assert_eq!(got.name(), "t1"); + } + + #[test] + fn drop_table_if_exists() { + let mgr = create_test_manager(); + let mut source = SourceTable::new("t_drop", "kafka", ConnectionType::Source); + source.schema_specs = vec![ColumnDescriptor::new_physical(Field::new( + "a", + DataType::Int32, + false, + ))]; + mgr.add_catalog_table(CatalogTable::ConnectorTable(source)) + .unwrap(); + + mgr.drop_catalog_table("t_drop", false).unwrap(); + assert!(!mgr.has_catalog_table("t_drop")); + + mgr.drop_catalog_table("t_drop", true).unwrap(); + assert!(mgr.drop_catalog_table("nope", false).is_err()); + mgr.drop_catalog_table("nope", true).unwrap(); + } +} diff --git a/src/storage/stream_catalog/meta_store.rs b/src/storage/stream_catalog/meta_store.rs new file mode 100644 index 00000000..6f61b3f7 --- /dev/null +++ 
b/src/storage/stream_catalog/meta_store.rs @@ -0,0 +1,70 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Pluggable metadata KV backend (memory, etcd, Redis, …). + +use std::collections::HashMap; + +use datafusion::common::Result; +use parking_lot::RwLock; + +/// Synchronous metadata store for catalog records. +pub trait MetaStore: Send + Sync { + fn put(&self, key: &str, value: Vec) -> Result<()>; + fn get(&self, key: &str) -> Result>>; + fn delete(&self, key: &str) -> Result<()>; + fn scan_prefix(&self, prefix: &str) -> Result)>>; +} + +/// In-process KV store for single-node deployments and tests. 
+pub struct InMemoryMetaStore { + db: RwLock>>, +} + +impl InMemoryMetaStore { + pub fn new() -> Self { + Self { + db: RwLock::new(HashMap::new()), + } + } +} + +impl Default for InMemoryMetaStore { + fn default() -> Self { + Self::new() + } +} + +impl MetaStore for InMemoryMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db.write().insert(key.to_string(), value); + Ok(()) + } + + fn get(&self, key: &str) -> Result>> { + Ok(self.db.read().get(key).cloned()) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db.write().remove(key); + Ok(()) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let db = self.db.read(); + Ok(db + .iter() + .filter(|(k, _)| k.starts_with(prefix)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect()) + } +} diff --git a/src/storage/stream_catalog/mod.rs b/src/storage/stream_catalog/mod.rs new file mode 100644 index 00000000..6f31317a --- /dev/null +++ b/src/storage/stream_catalog/mod.rs @@ -0,0 +1,25 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Stream table catalog: protobuf persistence, MVCC-style planning snapshots for the coordinator. 
+ +mod codec; +mod manager; +mod meta_store; +mod rocksdb_meta_store; + +pub use manager::{ + CatalogManager, initialize_stream_catalog, restore_global_catalog_from_store, + restore_streaming_jobs_from_store, +}; +pub use meta_store::{InMemoryMetaStore, MetaStore}; +pub use rocksdb_meta_store::RocksDbMetaStore; diff --git a/src/storage/stream_catalog/rocksdb_meta_store.rs b/src/storage/stream_catalog/rocksdb_meta_store.rs new file mode 100644 index 00000000..5315454f --- /dev/null +++ b/src/storage/stream_catalog/rocksdb_meta_store.rs @@ -0,0 +1,123 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! RocksDB-backed [`super::MetaStore`] for durable stream catalog rows. + +use std::path::Path; +use std::sync::Arc; + +use anyhow::Context; +use datafusion::common::Result; +use rocksdb::{DB, Direction, IteratorMode, Options}; + +use super::MetaStore; + +/// Single-node durable KV used by [`crate::storage::stream_catalog::CatalogManager`]. 
+pub struct RocksDbMetaStore { + db: Arc, +} + +impl RocksDbMetaStore { + pub fn open>(path: P) -> anyhow::Result { + let path = path.as_ref(); + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent) + .with_context(|| format!("stream catalog: create parent directory {parent:?}"))?; + } + let mut opts = Options::default(); + opts.create_if_missing(true); + let db = DB::open(&opts, path) + .with_context(|| format!("stream catalog: open RocksDB at {}", path.display()))?; + Ok(Self { db: Arc::new(db) }) + } +} + +impl MetaStore for RocksDbMetaStore { + fn put(&self, key: &str, value: Vec) -> Result<()> { + self.db.put(key.as_bytes(), value.as_slice()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!("stream catalog store put: {e}")) + }) + } + + fn get(&self, key: &str) -> Result>> { + self.db.get(key.as_bytes()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!("stream catalog store get: {e}")) + }) + } + + fn delete(&self, key: &str) -> Result<()> { + self.db.delete(key.as_bytes()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store delete: {e}" + )) + }) + } + + fn scan_prefix(&self, prefix: &str) -> Result)>> { + let mut out = Vec::new(); + let iter = self + .db + .iterator(IteratorMode::From(prefix.as_bytes(), Direction::Forward)); + for item in iter { + let (k, v) = item.map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store scan: {e}" + )) + })?; + let key = String::from_utf8(k.to_vec()).map_err(|e| { + datafusion::common::DataFusionError::Execution(format!( + "stream catalog store: invalid utf8 key: {e}" + )) + })?; + if !key.starts_with(prefix) { + break; + } + out.push((key, v.to_vec())); + } + Ok(out) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use uuid::Uuid; + + use super::*; + + #[test] + fn put_get_scan_roundtrip() { + let dir: PathBuf = + 
std::env::temp_dir().join(format!("fs_stream_catalog_test_{}", Uuid::new_v4())); + let _ = std::fs::remove_dir_all(&dir); + + let store = RocksDbMetaStore::open(&dir).expect("open"); + store.put("catalog:stream_table:a", vec![1, 2, 3]).unwrap(); + store.put("catalog:stream_table:b", vec![4]).unwrap(); + store.put("other:x", vec![9]).unwrap(); + + assert_eq!( + store.get("catalog:stream_table:a").unwrap(), + Some(vec![1, 2, 3]) + ); + + let prefixed = store.scan_prefix("catalog:stream_table:").unwrap(); + assert_eq!(prefixed.len(), 2); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":a"))); + assert!(prefixed.iter().any(|(k, _)| k.ends_with(":b"))); + + store.delete("catalog:stream_table:a").unwrap(); + assert!(store.get("catalog:stream_table:a").unwrap().is_none()); + + let _ = std::fs::remove_dir_all(&dir); + } +} diff --git a/src/storage/task/mod.rs b/src/storage/task/mod.rs index b4b3680f..3123415a 100644 --- a/src/storage/task/mod.rs +++ b/src/storage/task/mod.rs @@ -16,6 +16,7 @@ pub mod factory; mod function_info; +mod proto_codec; mod rocksdb_storage; pub mod storage; diff --git a/src/storage/task/proto_codec.rs b/src/storage/task/proto_codec.rs new file mode 100644 index 00000000..6c1bc8df --- /dev/null +++ b/src/storage/task/proto_codec.rs @@ -0,0 +1,262 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Protobuf wire format for RocksDB task rows, with legacy bincode read support. 
+ +use anyhow::{Context, Result, anyhow}; +use prost::Message; +use protocol::storage::{ + ComponentStateKind, ComponentStateProto, TaskMetadataProto, TaskModulePayloadProto, + TaskModulePython, TaskModuleWasm, task_module_payload_proto, +}; +use serde::{Deserialize, Serialize}; + +use crate::runtime::common::ComponentState; + +use super::storage::TaskModuleBytes; + +/// Magic prefix for protobuf-encoded task values (meta + payload). Legacy rows have no prefix. +pub const TASK_STORAGE_PROTO_MAGIC: &[u8; 4] = b"FSP1"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct LegacyTaskMetadata { + task_type: String, + state: ComponentState, + created_at: u64, + checkpoint_id: Option, +} + +fn component_state_to_proto(state: &ComponentState) -> ComponentStateProto { + let (kind, error_message) = match state { + ComponentState::Uninitialized => (ComponentStateKind::Uninitialized, String::new()), + ComponentState::Initialized => (ComponentStateKind::Initialized, String::new()), + ComponentState::Starting => (ComponentStateKind::Starting, String::new()), + ComponentState::Running => (ComponentStateKind::Running, String::new()), + ComponentState::Checkpointing => (ComponentStateKind::Checkpointing, String::new()), + ComponentState::Stopping => (ComponentStateKind::Stopping, String::new()), + ComponentState::Stopped => (ComponentStateKind::Stopped, String::new()), + ComponentState::Closing => (ComponentStateKind::Closing, String::new()), + ComponentState::Closed => (ComponentStateKind::Closed, String::new()), + ComponentState::Error { error } => (ComponentStateKind::Error, error.clone()), + }; + ComponentStateProto { + kind: kind as i32, + error_message, + } +} + +fn component_state_from_proto(p: &ComponentStateProto) -> ComponentState { + let kind = ComponentStateKind::try_from(p.kind).unwrap_or(ComponentStateKind::Unspecified); + match kind { + ComponentStateKind::Unspecified | ComponentStateKind::Uninitialized => { + ComponentState::Uninitialized + } + 
ComponentStateKind::Initialized => ComponentState::Initialized, + ComponentStateKind::Starting => ComponentState::Starting, + ComponentStateKind::Running => ComponentState::Running, + ComponentStateKind::Checkpointing => ComponentState::Checkpointing, + ComponentStateKind::Stopping => ComponentState::Stopping, + ComponentStateKind::Stopped => ComponentState::Stopped, + ComponentStateKind::Closing => ComponentState::Closing, + ComponentStateKind::Closed => ComponentState::Closed, + ComponentStateKind::Error => ComponentState::Error { + error: if p.error_message.is_empty() { + "unknown error".to_string() + } else { + p.error_message.clone() + }, + }, + } +} + +/// Encode task metadata for `task_meta` column family (always protobuf + magic). +pub fn encode_task_metadata_bytes( + task_type: &str, + state: &ComponentState, + created_at: u64, + checkpoint_id: Option, +) -> Result> { + let proto = TaskMetadataProto { + task_type: task_type.to_string(), + state: Some(component_state_to_proto(state)), + created_at, + checkpoint_id, + }; + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto.encode(&mut out).context("encode TaskMetadataProto")?; + Ok(out) +} + +pub struct DecodedTaskMetadata { + pub task_type: String, + pub state: ComponentState, + pub created_at: u64, + pub checkpoint_id: Option, +} + +/// Decode metadata written by this version (protobuf) or legacy bincode+serde. 
+pub fn decode_task_metadata_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskMetadataProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskMetadataProto")?; + let state = proto + .state + .as_ref() + .map(component_state_from_proto) + .unwrap_or_default(); + return Ok(DecodedTaskMetadata { + task_type: proto.task_type, + state, + created_at: proto.created_at, + checkpoint_id: proto.checkpoint_id, + }); + } + + let (legacy, _): (LegacyTaskMetadata, _) = + bincode::serde::decode_from_slice(raw, bincode::config::standard()) + .map_err(|e| anyhow!("legacy task metadata bincode decode failed: {e}"))?; + Ok(DecodedTaskMetadata { + task_type: legacy.task_type, + state: legacy.state, + created_at: legacy.created_at, + checkpoint_id: legacy.checkpoint_id, + }) +} + +fn module_to_proto(module: &TaskModuleBytes) -> TaskModulePayloadProto { + match module { + TaskModuleBytes::Wasm(bytes) => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Wasm(TaskModuleWasm { + wasm_binary: bytes.clone(), + })), + }, + TaskModuleBytes::Python { + class_name, + module, + bytes, + } => TaskModulePayloadProto { + payload: Some(task_module_payload_proto::Payload::Python( + TaskModulePython { + class_name: class_name.clone(), + module_path: module.clone(), + embedded_code: bytes.clone(), + }, + )), + }, + } +} + +/// Encode module payload for `task_payload` column family (always protobuf + magic). +pub fn encode_task_module_bytes(module: &TaskModuleBytes) -> Result> { + let proto = module_to_proto(module); + let mut out = TASK_STORAGE_PROTO_MAGIC.to_vec(); + proto + .encode(&mut out) + .context("encode TaskModulePayloadProto")?; + Ok(out) +} + +/// Decode module payload: protobuf+magic or legacy bincode+serde [`TaskModuleBytes`]. 
+pub fn decode_task_module_bytes(raw: &[u8]) -> Result { + if raw.len() >= TASK_STORAGE_PROTO_MAGIC.len() + && &raw[..TASK_STORAGE_PROTO_MAGIC.len()] == TASK_STORAGE_PROTO_MAGIC.as_slice() + { + let proto = TaskModulePayloadProto::decode(&raw[TASK_STORAGE_PROTO_MAGIC.len()..]) + .context("decode TaskModulePayloadProto")?; + return proto.try_into_task_module(); + } + + let (legacy, _): (TaskModuleBytes, _) = + bincode::serde::decode_from_slice(raw, bincode::config::standard()) + .map_err(|e| anyhow!("legacy task module bincode decode failed: {e}"))?; + Ok(legacy) +} + +trait TryIntoTaskModule { + fn try_into_task_module(self) -> Result; +} + +impl TryIntoTaskModule for TaskModulePayloadProto { + fn try_into_task_module(self) -> Result { + match self.payload { + Some(task_module_payload_proto::Payload::Wasm(w)) => { + Ok(TaskModuleBytes::Wasm(w.wasm_binary)) + } + Some(task_module_payload_proto::Payload::Python(p)) => Ok(TaskModuleBytes::Python { + class_name: p.class_name, + module: p.module_path, + bytes: p.embedded_code, + }), + None => Err(anyhow!("TaskModulePayloadProto missing payload")), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn metadata_roundtrip_proto() { + let enc = + encode_task_metadata_bytes("wasm", &ComponentState::Running, 42, Some(7)).unwrap(); + let dec = decode_task_metadata_bytes(&enc).unwrap(); + assert_eq!(dec.task_type, "wasm"); + assert_eq!(dec.state, ComponentState::Running); + assert_eq!(dec.created_at, 42); + assert_eq!(dec.checkpoint_id, Some(7)); + } + + #[test] + fn module_roundtrip_wasm_proto() { + let m = TaskModuleBytes::Wasm(vec![1, 2, 3]); + let enc = encode_task_module_bytes(&m).unwrap(); + let dec = decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn module_roundtrip_python_proto() { + let m = TaskModuleBytes::Python { + class_name: "C".into(), + module: "m".into(), + bytes: Some(vec![9]), + }; + let enc = encode_task_module_bytes(&m).unwrap(); + let dec = 
decode_task_module_bytes(&enc).unwrap(); + assert_eq!(dec, m); + } + + #[test] + fn legacy_bincode_metadata_still_decodes() { + let legacy = LegacyTaskMetadata { + task_type: "legacy".into(), + state: ComponentState::Stopped, + created_at: 99, + checkpoint_id: None, + }; + let raw = bincode::serde::encode_to_vec(&legacy, bincode::config::standard()).unwrap(); + let dec = decode_task_metadata_bytes(&raw).unwrap(); + assert_eq!(dec.task_type, "legacy"); + assert_eq!(dec.state, ComponentState::Stopped); + assert_eq!(dec.created_at, 99); + } + + #[test] + fn legacy_bincode_module_still_decodes() { + let m = TaskModuleBytes::Wasm(vec![8, 9]); + let raw = bincode::serde::encode_to_vec(&m, bincode::config::standard()).unwrap(); + assert_eq!(decode_task_module_bytes(&raw).unwrap(), m); + } +} diff --git a/src/storage/task/rocksdb_storage.rs b/src/storage/task/rocksdb_storage.rs index 31709a51..cea0ceb9 100644 --- a/src/storage/task/rocksdb_storage.rs +++ b/src/storage/task/rocksdb_storage.rs @@ -14,12 +14,15 @@ //! //! Uses three column families: task_meta, task_config, task_payload. 
-use super::storage::{StoredTaskInfo, TaskModuleBytes, TaskStorage}; +use super::proto_codec::{ + decode_task_metadata_bytes, decode_task_module_bytes, encode_task_metadata_bytes, + encode_task_module_bytes, +}; +use super::storage::{StoredTaskInfo, TaskStorage}; use crate::config::storage::RocksDBStorageConfig; use crate::runtime::common::ComponentState; use anyhow::{Context, Result, anyhow}; use rocksdb::{ColumnFamilyDescriptor, DB, IteratorMode, Options, WriteBatch}; -use serde::{Deserialize, Serialize}; use std::path::Path; use std::sync::Arc; @@ -27,14 +30,6 @@ const CF_METADATA: &str = "task_meta"; const CF_CONFIG: &str = "task_config"; const CF_PAYLOAD: &str = "task_payload"; -#[derive(Debug, Clone, Serialize, Deserialize)] -struct TaskMetadata { - task_type: String, - state: ComponentState, - created_at: u64, - checkpoint_id: Option, -} - pub struct RocksDBTaskStorage { db: Arc, } @@ -95,19 +90,19 @@ impl TaskStorage for RocksDBTaskStorage { return Err(anyhow!("Task uniqueness violation: {}", task_info.name)); } - let meta = TaskMetadata { - task_type: task_info.task_type.clone(), - state: task_info.state.clone(), - created_at: task_info.created_at, - checkpoint_id: task_info.checkpoint_id, - }; + let meta_bytes = encode_task_metadata_bytes( + &task_info.task_type, + &task_info.state, + task_info.created_at, + task_info.checkpoint_id, + )?; let mut batch = WriteBatch::default(); - batch.put_cf(&cf_meta, key, bincode::serialize(&meta)?); + batch.put_cf(&cf_meta, key, meta_bytes); batch.put_cf(&cf_conf, key, &task_info.config_bytes); if let Some(ref module) = task_info.module_bytes { - batch.put_cf(&cf_payl, key, bincode::serialize(module)?); + batch.put_cf(&cf_payl, key, encode_task_module_bytes(module)?); } self.db @@ -124,10 +119,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? 
.ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.state = new_state; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.state = new_state; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -140,10 +144,19 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&cf, key)? .ok_or_else(|| anyhow!("Task {} not found", task_name))?; - let mut meta: TaskMetadata = bincode::deserialize(&raw)?; - meta.checkpoint_id = checkpoint_id; - - self.db.put_cf(&cf, key, bincode::serialize(&meta)?)?; + let mut decoded = decode_task_metadata_bytes(&raw)?; + decoded.checkpoint_id = checkpoint_id; + + self.db.put_cf( + &cf, + key, + encode_task_metadata_bytes( + &decoded.task_type, + &decoded.state, + decoded.created_at, + decoded.checkpoint_id, + )?, + )?; Ok(()) } @@ -171,12 +184,12 @@ impl TaskStorage for RocksDBTaskStorage { .get_cf(&self.get_cf(CF_CONFIG)?, key)? .ok_or_else(|| anyhow!("Config missing: {}", task_name))?; - let module_bytes = self - .db - .get_cf(&self.get_cf(CF_PAYLOAD)?, key)? - .and_then(|b| bincode::deserialize::(&b).ok()); + let module_bytes = match self.db.get_cf(&self.get_cf(CF_PAYLOAD)?, key)? 
{ + None => None, + Some(b) => Some(decode_task_module_bytes(&b)?), + }; - let meta: TaskMetadata = bincode::deserialize(&meta_raw)?; + let meta = decode_task_metadata_bytes(&meta_raw)?; Ok(StoredTaskInfo { name: task_name.to_string(), diff --git a/src/storage/task/storage.rs b/src/storage/task/storage.rs index 3c9e4080..156ee5d8 100644 --- a/src/storage/task/storage.rs +++ b/src/storage/task/storage.rs @@ -15,7 +15,7 @@ use anyhow::Result; use serde::{Deserialize, Serialize}; #[allow(dead_code)] -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum TaskModuleBytes { Wasm(Vec), Python {