diff --git a/.cargo/config.toml b/.cargo/config.toml index 6c3ced4eaa..01ff41a155 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -12,7 +12,8 @@ CARGO_LLVM_COV_TARGET_DIR = { value = "target/llvm-cov/build", relative = true, CARGO_LLVM_COV_BUILD_DIR = { value = "target/llvm-cov/target", relative = true, force = false } [build] -rustflags = ["--cfg=tokio_unstable"] +# Register `emulated` so cfg_attr sites don't trip unexpected_cfgs natively. +rustflags = ["--cfg=tokio_unstable", "--check-cfg=cfg(emulated)"] [target.wasm32-wasip1] # Trailing `--` separates wasmtime's CLI from the module + module args @@ -20,5 +21,17 @@ rustflags = ["--cfg=tokio_unstable"] # wasm module instead of being parsed as a wasmtime option. runner = "wasmtime run --dir . --" +[target.x86_64-unknown-linux-gnu] +runner = "scripts/test-runner.sh x86_64" + +[target.x86_64-unknown-linux-musl] +runner = "scripts/test-runner.sh x86_64" + +[target.aarch64-unknown-linux-gnu] +runner = "scripts/test-runner.sh aarch64" + +[target.aarch64-unknown-linux-musl] +runner = "scripts/test-runner.sh aarch64" + [alias] nt = "nextest" diff --git a/.config/nextest.toml b/.config/nextest.toml index 7eb035b6ff..5b23136755 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -146,3 +146,8 @@ vm = { max-threads = 1 } [profile.miri] slow-timeout = { period = "500s" } + +[profile.cross-qemu] +slow-timeout = { period = "60s", terminate-after = 5 } +fail-fast = false +failure-output = "immediate-final" diff --git a/.github/actions/sticky-pr-comment/action.yml b/.github/actions/sticky-pr-comment/action.yml new file mode 100644 index 0000000000..cd721150ab --- /dev/null +++ b/.github/actions/sticky-pr-comment/action.yml @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright Open Network Fabric Authors + +# Create-or-update a sticky PR comment keyed by an HTML-comment marker. +# Caller needs `pull-requests: write`. + +name: "Sticky PR comment" +description: "Create-or-update a sticky comment on the current pull request, keyed by marker" + +inputs: + marker: + description: "HTML-comment marker (e.g. ``) uniquely identifying this comment. Prepended to the body automatically." + required: true + body: + description: "Markdown body of the comment (without the marker)." + required: true + pr-number: + description: "Pull request number to comment on." + default: "${{ github.event.pull_request.number }}" + token: + description: "GitHub token with `pull-requests: write`." + default: "${{ github.token }}" + +runs: + using: "composite" + steps: + - name: "Create or update sticky comment" + uses: "actions/github-script@v8" + env: + STICKY_MARKER: "${{ inputs.marker }}" + STICKY_BODY: "${{ inputs.body }}" + STICKY_PR_NUMBER: "${{ inputs.pr-number }}" + with: + github-token: "${{ inputs.token }}" + script: | + const marker = process.env.STICKY_MARKER; + const userBody = process.env.STICKY_BODY; + const prNumber = Number.parseInt(process.env.STICKY_PR_NUMBER, 10); + if (!marker || !marker.startsWith('')) { + core.setFailed( + `sticky-pr-comment: 'marker' must be an HTML comment ` + + `(\`\`); got: ${JSON.stringify(marker)}`, + ); + return; + } + if (!Number.isInteger(prNumber) || prNumber <= 0) { + core.setFailed( + `sticky-pr-comment: 'pr-number' must be a positive integer; ` + + `got: ${JSON.stringify(process.env.STICKY_PR_NUMBER)}. ` + + `(Are you invoking this on a non-pull_request event without ` + + `passing pr-number explicitly?)`, + ); + return; + } + const body = `${marker}\n${userBody}`; + const comments = await github.paginate( + github.rest.issues.listComments, + { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + per_page: 100, + }, + ); + const existing = comments.find( + c => typeof c.body === 'string' && c.body.startsWith(marker), + ); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + core.info(`sticky-pr-comment: updated comment ${existing.id}`); + } else { + const { data: created } = await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body, + }); + core.info(`sticky-pr-comment: created comment ${created.id}`); + } diff --git a/.github/workflows/bump.yml b/.github/workflows/bump.yml index 3f325ab398..10438b42ea 100644 --- a/.github/workflows/bump.yml +++ b/.github/workflows/bump.yml @@ -149,9 +149,12 @@ jobs: token: "${{ steps.app-token.outputs.token }}" branch: "bump/cargo-upgrades" title: "bump(cargo)!: :rocket: upgrades available" + # ci:+cross/full opts the bump PR into the full cross matrix + # (build + qemu-user test on musl) to catch upstream regressions. labels: | automated dependencies + ci:+cross/full signoff: "true" sign-commits: "true" body: | diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 87387d0ce9..e708fa9c9d 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -447,6 +447,7 @@ jobs: github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'ci:+cross') + || contains(github.event.pull_request.labels.*.name, 'ci:+cross/full') ) || (github.event_name == 'push' || github.event_name == 'merge_group') }} @@ -489,6 +490,25 @@ jobs: steps: - *checkout - *nix-setup + - name: "test" + if: >- + ${{ + matrix.recipe.args == 'dataplane' + && matrix.libc == 'musl' + && github.event_name == 'pull_request' + && ( + contains(github.event.pull_request.labels.*.name, 'ci:+cross') + || contains(github.event.pull_request.labels.*.name, 'ci:+cross/full') + ) + }} + uses: *just + env: + JUST_VARS: >- + platform=${{ matrix.platform }} + profile=${{ matrix.profile }} + libc=${{ matrix.libc }} + with: + recipe: "test" - name: "${{ matrix.platform }}/${{ matrix.libc }}/${{ matrix.profile }}/${{ matrix.recipe.args }}" uses: *just env: @@ -535,25 +555,6 @@ jobs: uses: *just with: recipe: "test" - - name: "shuttle_pct" - env: - JUST_VARS: >- - docker_sock=/run/docker/docker.sock - debug_justfile=${{ github.event_name == 'workflow_dispatch' && github.event.inputs.debug_justfile || false }} - profile=release - features=shuttle_pct - oci_repo=ghcr.io - uses: *just - with: - recipe: "test" - # The `loom` feature flips `concurrency::sync` to loom's - # primitives workspace-wide, which breaks crates that rely on - # `Weak`, `Arc::downgrade`, etc. (those aren't in - # `loom::sync`). Scope the loom build to only the concurrency - # package (which hosts the core concurrency tests) so workspace - # feature unification doesn't poison unrelated crates. - # - # TODO: gate tests which can't be used with loom - name: "loom" env: JUST_VARS: >- @@ -565,7 +566,6 @@ jobs: uses: *just with: recipe: "test" - recipe_args: "concurrency" - *tmate vlab: @@ -655,6 +655,13 @@ jobs: summary: name: "Summary" runs-on: "ubuntu-latest" + # `pull-requests: write` is for the "Surface cross failures" step. + # `actions: read` lets that same step query the cross matrix's real + # job outcomes via the REST API. + permissions: + actions: "read" + contents: "read" + pull-requests: "write" needs: - check - sanitize @@ -713,6 +720,76 @@ jobs: run: | echo '::error:: Some vlab job(s) failed' exit 1 + # Cross is advisory (continue-on-error: true at the job level), + # which makes `needs.cross.result` always report `success` to + # dependents -- the very property that keeps leg flakes from + # blocking merge also hides genuine failures from this gate. + # Read the underlying matrix job outcomes from the REST API + # instead, then exit 1 only via the sticky comment path. + - name: "Detect cross matrix failures" + id: "cross_status" + if: >- + ${{ + github.event_name == 'pull_request' + && needs.cross.result != 'skipped' + }} + uses: "actions/github-script@v8" + with: + script: | + // Cross matrix jobs are named + // `${recipe.name}/${recipe.args}/${platform}/${libc}`. + // `recipe.name` is always "build-container" in the `cross` + // job's matrix, so the prefix uniquely identifies them. + const jobs = await github.paginate( + github.rest.actions.listJobsForWorkflowRun, + { + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.runId, + per_page: 100, + }, + ); + const cross = jobs.filter(j => j.name.startsWith('build-container/')); + const failed = cross.filter(j => j.conclusion === 'failure'); + core.setOutput('failed_count', String(failed.length)); + core.setOutput('failed_names', failed.map(j => j.name).join(', ')); + const heading = failed.length === 0 + ? `All ${cross.length} cross matrix entries passed.` + : `${failed.length} of ${cross.length} cross matrix entries failed.`; + await core.summary.addHeading('Cross build advisory').addRaw(heading).write(); + # The sticky-comment composite action is a local action; it has + # to be on the runner's disk for `uses:` to resolve. The summary + # job has no other reason to check the repo out, so do a sparse + # checkout of just the action directory, gated on the same + # failure condition as the comment step itself. + - name: "Checkout sticky-pr-comment action" + if: >- + ${{ + github.event_name == 'pull_request' + && steps.cross_status.outputs.failed_count != '' + && steps.cross_status.outputs.failed_count != '0' + }} + uses: "actions/checkout@v6" + with: + persist-credentials: "false" + sparse-checkout: ".github/actions/sticky-pr-comment" + sparse-checkout-cone-mode: "false" + - name: "Surface cross failures via sticky PR comment" + if: >- + ${{ + github.event_name == 'pull_request' + && steps.cross_status.outputs.failed_count != '' + && steps.cross_status.outputs.failed_count != '0' + }} + uses: "./.github/actions/sticky-pr-comment" + with: + marker: "" + body: | + ## :warning: Cross build advisory: failure detected :warning: + + [Please investigate before merging](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + Failed legs: ${{ steps.cross_status.outputs.failed_names }} publish: runs-on: lab diff --git a/.github/workflows/lint-opengrep.yml b/.github/workflows/lint-opengrep.yml index b64c54c2eb..2438f2634a 100644 --- a/.github/workflows/lint-opengrep.yml +++ b/.github/workflows/lint-opengrep.yml @@ -40,4 +40,6 @@ jobs: shell: "bash" run: | set -euo pipefail - opengrep scan --experimental --verbose --error + opengrep scan --experimental --verbose --error \ + --config auto \ + --config .semgrep/rules diff --git a/.gitignore b/.gitignore index 6d529d2d89..8450062587 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ **.profraw **/__fuzz__/** +# qemu-user core dumps from SIGABRT under emulated tests. +**/qemu_*.core result* result*/** **/target/** diff --git a/.semgrep/rules/no-std-sync-direct.yaml b/.semgrep/rules/no-std-sync-direct.yaml new file mode 100644 index 0000000000..87045b07af --- /dev/null +++ b/.semgrep/rules/no-std-sync-direct.yaml @@ -0,0 +1,80 @@ +# Path-aware enforcement of the `concurrency::sync` facade. +# +# `clippy::disallowed_types` resolves through re-exports: it sees +# `concurrency::sync::Arc` as `std::sync::Arc` (same canonical path), +# so it cannot distinguish "imported via the workspace facade" from +# "imported directly from std::sync". opengrep's source-level pattern +# matching has no such limit — it sees what was written, not what was +# resolved. +# +# These rules forbid both `use std::sync::X;` imports and bare +# fully-qualified path expressions (`std::sync::X::new(...)`, +# `std::sync::X`) for any type the `concurrency::sync` facade +# exposes. Switching to `concurrency::sync` is the migration that lets +# future loom/shuttle test builds hook these types via the backend +# cfg. The lock types (Mutex/RwLock and guards) are also caught by +# `clippy::disallowed_types` because parking_lot provides a distinct +# production type; this rule is the only enforcement for Arc / Weak / +# atomic / LazyLock / OnceLock / Once / Barrier / Condvar. +# +# The `concurrency` crate itself is exempt (it IS the facade) and so are +# its tests (which use the facade by design but are flagged by clippy's +# alias resolution; that allow is per-file). Anywhere else, a hit means +# "switch to `use concurrency::sync::X;` and drop the FQN". +rules: + - id: rust-no-direct-std-sync-import + languages: [rust] + severity: ERROR + message: | + Import synchronization primitives via `concurrency::sync::*`, not + `std::sync::*`. The workspace's `concurrency::sync` facade hands + you parking_lot-backed locks in production and loom/shuttle + equivalents under the model-checker features; importing directly + from `std::sync` bypasses that backend swap. + paths: + exclude: + # The wrapper layer is allowed to use std::sync directly. + - concurrency/src/sync/ + # quiescent/slot are concurrency-internal users of the facade + # that still parse as std::sync because the lint matches paths + # textually here too. + - concurrency/src/quiescent.rs + - concurrency/src/slot.rs + # Tests for the concurrency crate use the wrapped types + # deliberately; the allow is at the file head. + - concurrency/tests/ + pattern-either: + # Direct imports: `use std::sync::X;`. + - pattern: use std::sync::Arc; + - pattern: use std::sync::Weak; + - pattern: use std::sync::Mutex; + - pattern: use std::sync::MutexGuard; + - pattern: use std::sync::RwLock; + - pattern: use std::sync::RwLockReadGuard; + - pattern: use std::sync::RwLockWriteGuard; + - pattern: use std::sync::OnceLock; + - pattern: use std::sync::LazyLock; + - pattern: use std::sync::Once; + - pattern: use std::sync::Barrier; + - pattern: use std::sync::Condvar; + - pattern: use std::sync::atomic; + - pattern: use std::sync::atomic::$X; + - pattern: use std::sync::PoisonError; + # Grouped imports across one or many lines: + # `use std::sync::{Arc, Mutex};` and the multi-line variant. + # AST-level matching for `use ... :: { ... }` is hit-and-miss + # across tree-sitter versions; a regex with `(?s)` (dotall) + # spans newlines without anchoring on the import keyword's + # column. + - pattern-regex: '(?s)use\s+std::sync::\{[^}]*\b(?:Arc|Weak|Mutex|MutexGuard|RwLock|RwLockReadGuard|RwLockWriteGuard|OnceLock|LazyLock|Once|Barrier|Condvar|PoisonError|atomic)\b' + # Fully-qualified path expressions / type references that skip + # the `use` line entirely (e.g. `static X: std::sync::LazyLock + # = std::sync::LazyLock::new(...)`, or inline turbofish + # like `std::sync::Arc::::new`). Bare `pattern: std::sync + # ::X` does not fire reliably in opengrep's Rust frontend + # across path-vs-type contexts; a regex is the dependable + # backstop. The leading `(?m)^(?!\s*(?://|\*))` skips lines + # whose first non-whitespace is a `//` line/doc comment or a + # block-comment continuation `*`, so rustdoc intra-doc links + # like `/// [std::sync::Arc]` don't false-positive. + - pattern-regex: '(?m)^(?!\s*(?://|\*)).*\bstd::sync::(?:Arc|Weak|Mutex|MutexGuard|RwLock|RwLockReadGuard|RwLockWriteGuard|OnceLock|LazyLock|Once|Barrier|Condvar|PoisonError|atomic::[A-Za-z_][A-Za-z0-9_]*)\b' diff --git a/Cargo.lock b/Cargo.lock index 232ff704f4..b29b22688e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1218,6 +1218,7 @@ version = "0.21.0" dependencies = [ "clap", "colored", + "dataplane-concurrency", "nix 0.31.3", "rand 0.10.1", "reedline", @@ -1267,6 +1268,7 @@ dependencies = [ "caps", "chrono", "dataplane-common", + "dataplane-concurrency", "dataplane-k8s-intf", "dataplane-lpm", "dataplane-net", @@ -1416,6 +1418,7 @@ name = "dataplane-interface-manager" version = "0.21.0" dependencies = [ "bolero", + "dataplane-concurrency", "dataplane-net", "dataplane-rekon", "derive_builder", @@ -1436,6 +1439,7 @@ name = "dataplane-k8s-intf" version = "0.21.0" dependencies = [ "bolero", + "dataplane-concurrency", "dataplane-dpdk-sysroot-helper", "dataplane-lpm", "dataplane-net", @@ -1472,6 +1476,7 @@ name = "dataplane-left-right-tlcache" version = "0.21.0" dependencies = [ "ahash", + "dataplane-concurrency", "left-right 0.11.7 (git+https://github.com/githedgehog/left-right.git?branch=fredi%2Ffix-writehandle-drop)", "serial_test", "thiserror", @@ -1689,6 +1694,7 @@ dependencies = [ name = "dataplane-sysfs" version = "0.21.0" dependencies = [ + "dataplane-concurrency", "n-vm", "nix 0.31.3", "procfs", @@ -1713,6 +1719,7 @@ version = "0.21.0" dependencies = [ "color-eyre", "dataplane-common", + "dataplane-concurrency", "linkme", "ordermap", "serial_test", diff --git a/args/src/lib.rs b/args/src/lib.rs index 28c4728ee0..63e37f2a10 100644 --- a/args/src/lib.rs +++ b/args/src/lib.rs @@ -1511,4 +1511,10 @@ mod tests { // bad discriminant assert!(InterfaceArg::from_str("GbEth1.9000=foo@0000:02:01.7").is_err()); } + + #[cfg(target_arch = "aarch64")] + #[test] + fn intentional_fail() { + panic!("oh no! A regression on aarch64! We should reject this commit!") + } } diff --git a/cli/Cargo.toml b/cli/Cargo.toml index cfb032e2fa..bd61ca5883 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -12,6 +12,7 @@ path = "bin/main.rs" [dependencies] clap = { workspace = true, features = ["derive", "std", "usage"] } +concurrency = { workspace = true } colored = { workspace = true, features = [] } nix = { workspace = true, features = ["socket"] } rkyv = { workspace = true, features = ["alloc", "bytecheck", "std"] } diff --git a/cli/bin/completions.rs b/cli/bin/completions.rs index 64d370bfb8..8b757da0d4 100644 --- a/cli/bin/completions.rs +++ b/cli/bin/completions.rs @@ -4,8 +4,8 @@ //! Adds command completions use crate::cmdtree::Node; +use concurrency::sync::Arc; use reedline::{Completer, Span, Suggestion}; -use std::sync::Arc; #[derive(Default)] pub struct CmdCompleter { diff --git a/cli/bin/main.rs b/cli/bin/main.rs index 3b7801b4f7..e38488f43f 100644 --- a/cli/bin/main.rs +++ b/cli/bin/main.rs @@ -12,11 +12,11 @@ use cmdline::Cmdline; use cmdtree::Node; use cmdtree_dp::gw_cmd_tree; use colored::Colorize; +use concurrency::sync::Arc; use dataplane_cli::cliproto::CliLocalError; use dataplane_cli::cliproto::{CliAction, CliRequest, CliResponse}; use std::io::stdin; use std::os::unix::net::UnixDatagram; -use std::sync::Arc; use terminal::{TermInput, Terminal}; mod argsparse; diff --git a/cli/bin/terminal.rs b/cli/bin/terminal.rs index fee7677a7c..b7af08034b 100644 --- a/cli/bin/terminal.rs +++ b/cli/bin/terminal.rs @@ -12,6 +12,7 @@ use reedline::{ PromptHistorySearch, Reedline, ReedlineEvent, ReedlineMenu, Signal, default_emacs_keybindings, }; +use concurrency::sync::Arc; use std::borrow::Cow; use std::collections::HashMap; use std::collections::VecDeque; @@ -22,7 +23,6 @@ use std::net::Shutdown; use std::os::unix::fs::PermissionsExt; use std::os::unix::net::UnixDatagram; use std::path::Path; -use std::sync::Arc; // our completer use crate::completions::CmdCompleter; diff --git a/cli/src/cliproto.rs b/cli/src/cliproto.rs index da0c47d0a4..1d463fbfcd 100644 --- a/cli/src/cliproto.rs +++ b/cli/src/cliproto.rs @@ -437,7 +437,10 @@ mod tests { /// Open 2 sockets, one for dataplane and one for cli. Spawn a thread representing dataplane. /// Send dataplane a request and receive a big response from it. #[test] - #[cfg_attr(miri, ignore = "miri does not support Unix sockets")] + #[cfg_attr( + emulated, + ignore = "Unix sockets unsupported under miri / flaky under qemu-user" + )] fn test_communications() { const DP_PATH: &str = "/tmp/dpsock"; const CLI_PATH: &str = "/tmp/clisock"; diff --git a/clippy.toml b/clippy.toml index 711aa2ce79..0060be8ee0 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,3 +2,11 @@ allow-expect-in-tests = true # panic is completely reasonable in a test setting allow-panic-in-tests = true # panic is completely reasonable in a test setting allow-unwrap-in-tests = true # panic is completely reasonable in a test setting +disallowed-types = [ + # import via concurrency crate! + "std::sync::Mutex", + "std::sync::MutexGuard", + "std::sync::RwLock", + "std::sync::RwLockReadGuard", + "std::sync::RwLockWriteGuard", +] diff --git a/common/src/cliprovider.rs b/common/src/cliprovider.rs index 8389df2b24..01a32b8fa1 100644 --- a/common/src/cliprovider.rs +++ b/common/src/cliprovider.rs @@ -90,7 +90,7 @@ where T: CliDataProvider, { fn provide(&self) -> String { - self.load_full().provide() + self.load().provide() } } diff --git a/concurrency-macros/src/lib.rs b/concurrency-macros/src/lib.rs index 8d2ee227eb..d643a6ec5d 100644 --- a/concurrency-macros/src/lib.rs +++ b/concurrency-macros/src/lib.rs @@ -108,10 +108,9 @@ pub fn concurrency_mode(attr: TokenStream, item: TokenStream) -> TokenStream { /// `#[test] fn () { concurrency::stress(|| { original }) }`, /// which calls the body once. /// -/// Under any model-checker backend (`loom`, `shuttle`, `shuttle_pct`, -/// `shuttle_dfs`), expands to a nested module so the test's binary -/// path identifies the active backend in nextest reports / JUnit -/// output: +/// Under either model-checker backend (`loom`, `shuttle`), expands to +/// a nested module so the test's binary path identifies the active +/// backend in nextest reports / JUnit output: /// /// ```text /// // #[concurrency::test] fn some_test() { body } @@ -124,11 +123,14 @@ pub fn concurrency_mode(attr: TokenStream, item: TokenStream) -> TokenStream { /// } /// ``` /// -/// The same shape applies for `shuttle` / `shuttle_pct` / `shuttle_dfs`, -/// each writing the function name that names the active backend. -/// Nextest filters like `-E 'test(/concurrency_model::loom$/)'` then -/// pick out the loom-backed runs cleanly without having to grep on -/// binary names. +/// The same shape applies for `shuttle`, writing the function name +/// that names the active backend. Nextest filters like `-E +/// 'test(/concurrency_model::loom$/)'` then pick out the loom-backed +/// runs cleanly without having to grep on binary names. Under +/// `shuttle` the body runs through `concurrency::stress`, which +/// internally drives a [`shuttle::PortfolioRunner`] (Random + PCT, +/// optionally DFS) -- the macro only emits one leaf test per +/// backend. /// /// # Example /// @@ -146,13 +148,14 @@ pub fn concurrency_mode(attr: TokenStream, item: TokenStream) -> TokenStream { /// /// # Limitations /// -/// * **Single-threaded bodies fail under `shuttle_pct`.** Shuttle's PCT -/// scheduler panics at runtime if the test closure does not exercise -/// any concurrent atomic / thread operation (no `thread::spawn`, no -/// contended `Mutex`/`Arc`). The detection is dynamic, so the macro -/// cannot reject these statically; if you need such a test, gate it -/// with `#[cfg(not(feature = "shuttle_pct"))]` or use a regular -/// `#[test]` for the default-only smoke check. +/// * **Single-threaded bodies fail under `shuttle`.** The shuttle +/// portfolio runs the PCT scheduler unconditionally, and PCT +/// panics at runtime if the test closure does not exercise any +/// concurrent atomic / thread operation on the main thread (no +/// `thread::spawn`, no contended `Mutex`/`Arc`). The detection is +/// dynamic, so the macro cannot reject these statically; if you +/// need such a test, gate it with `#[cfg(not(feature = "shuttle"))]` +/// or use a regular `#[test]` for the default-only smoke check. /// * **Async bodies and arguments are rejected at parse time** with a /// clear compile error. #[proc_macro_attribute] @@ -215,26 +218,12 @@ pub fn test(_attr: TokenStream, item: TokenStream) -> TokenStream { #krate::stress(|| #block); } - #[cfg(all(feature = "shuttle", not(feature = "shuttle_pct")))] + #[cfg(feature = "shuttle")] #[::core::prelude::v1::test] #(#attrs)* fn shuttle() { #krate::stress(|| #block); } - - #[cfg(all(feature = "shuttle_pct", not(feature = "shuttle_dfs")))] - #[::core::prelude::v1::test] - #(#attrs)* - fn shuttle_pct() { - #krate::stress(|| #block); - } - - #[cfg(feature = "shuttle_dfs")] - #[::core::prelude::v1::test] - #(#attrs)* - fn shuttle_dfs() { - #krate::stress(|| #block); - } } } } diff --git a/concurrency/Cargo.toml b/concurrency/Cargo.toml index 3cd3e072dd..1bb63dda73 100644 --- a/concurrency/Cargo.toml +++ b/concurrency/Cargo.toml @@ -9,28 +9,10 @@ version.workspace = true default = ["parking_lot"] loom = ["dep:loom", "concurrency-macros/loom"] parking_lot = ["dep:parking_lot"] -# `shuttle*` are three views of one backend, not three backends: -# -# * `shuttle` -- shuttle with the random scheduler (the default -# for first-time users -- you almost always want -# this one). -# * `shuttle_pct` -- shuttle with the PCT scheduler. Use when you -# want to bias toward rare interleavings. -# * `shuttle_dfs` -- shuttle with the DFS scheduler. Use for -# exhaustive small-state exploration. -# -# All three share the same `dep:shuttle` machinery; only the scheduler -# selected at runtime differs. See `concurrency::stress` for the -# dispatch and the `#[concurrency::test]` attribute macro for the -# write-once-run-everywhere wrapper. -# -# The features form a chain (`shuttle_dfs` -> `shuttle_pct` -> `shuttle`) -# so that any `feature = "shuttle"` cfg check is true under all three -# variants. cfg_select-style precedence (most-specific first) still -# picks the right scheduler. See `concurrency::stress`. +# `shuttle` drives a `PortfolioRunner` (Random + PCT) via `concurrency::stress`. +# `shuttle_dfs` additively adds the DFS scheduler to the same portfolio. shuttle = ["dep:shuttle", "concurrency-macros/shuttle"] -shuttle_pct = ["shuttle"] -shuttle_dfs = ["shuttle_pct"] +shuttle_dfs = ["shuttle"] silence_clippy = ["concurrency-macros/silence_clippy"] # Do not manually enable this feature, let --all-features do it for you # Force the Mutex-based slot fallback even without a model checker. # Intended for the CI miri job that exercises the fallback slot under diff --git a/concurrency/src/quiescent.rs b/concurrency/src/quiescent.rs index 81ed7514a8..db19a0cba2 100644 --- a/concurrency/src/quiescent.rs +++ b/concurrency/src/quiescent.rs @@ -2,6 +2,7 @@ // Copyright Open Network Fabric Authors #![doc = include_str!("../QUIESCENT.md")] +#![allow(clippy::disallowed_types)] use core::cell::{Cell, RefCell}; use core::marker::PhantomData; diff --git a/concurrency/src/slot.rs b/concurrency/src/slot.rs index 291fbd846b..7d13c02eb2 100644 --- a/concurrency/src/slot.rs +++ b/concurrency/src/slot.rs @@ -48,6 +48,10 @@ cfg_select! { Self(Mutex::new(value)) } + pub fn load(&self) -> Arc { + self.load_full() + } + pub fn load_full(&self) -> Arc { let guard = self.0.lock(); Arc::clone(&*guard) @@ -84,6 +88,10 @@ cfg_select! { Self(Mutex::new(value)) } + pub fn load(&self) -> Option> { + self.load_full() + } + pub fn load_full(&self) -> Option> { let guard = self.0.lock(); guard.as_ref().map(Arc::clone) @@ -125,6 +133,11 @@ cfg_select! { Self(ArcSwap::new(value)) } + #[inline] + pub fn load(&self) -> arc_swap::Guard> { + self.0.load() + } + #[inline] pub fn load_full(&self) -> Arc { self.0.load_full() @@ -165,6 +178,11 @@ cfg_select! { Self(ArcSwapOption::new(value)) } + #[inline] + pub fn load(&self) -> arc_swap::Guard>> { + self.0.load() + } + #[inline] pub fn load_full(&self) -> Option> { self.0.load_full() diff --git a/concurrency/src/stress.rs b/concurrency/src/stress.rs index 243b48ec2b..884a359c5b 100644 --- a/concurrency/src/stress.rs +++ b/concurrency/src/stress.rs @@ -6,53 +6,76 @@ //! [`stress`] runs `body` under whichever concurrency backend the crate //! was compiled against: //! -//! * default backend -- direct call, no scheduling exploration -//! * `loom` feature -- `loom::model` -//! * `shuttle` feature -- `shuttle::check_random` -//! * `shuttle_pct` feature -- `shuttle::check_pct` -//! * `shuttle_dfs` feature -- `shuttle::check_dfs` (capped at `ITERATIONS`) +//! * default backend -- direct call, no scheduling exploration. +//! * `loom` feature -- `loom::model`. +//! * `shuttle` feature -- a [`shuttle::PortfolioRunner`] that runs +//! `RandomScheduler` and `PctScheduler` in parallel; any scheduler +//! finding a failing execution fails the whole test. +//! * `shuttle_dfs` (additive on top of `shuttle`) -- also adds +//! `DfsScheduler` to the portfolio. //! -//! `lib.rs` `compile_error!`s if both `loom` and any `shuttle*` are -//! enabled at once, so only one branch should ever fire in a real -//! build. Under `--all-features` the `silence_clippy` escape hatch -//! suppresses that error and the `cfg_select!` below resolves the -//! arms in this order: `loom > shuttle_dfs > shuttle_pct > shuttle`. -//! Same precedence the routing in `concurrency::sync` uses. -//! -//! Tests written once exercise any of these by toggling features on the -//! crate. The `#[concurrency::test]` attribute (in `concurrency-macros`) -//! is a thin wrapper that calls this function for you. +//! Both backends bump the coroutine stack to 4 MiB: shuttle/loom wrap +//! every primitive with bookkeeping (an `AtomicBool` runs ~100 bytes), +//! so the heaviest workspace test (NAT allocator's per-block atomic +//! arrays) blows through the defaults (32 KiB shuttle, 4 KiB loom). +//! For loom the bump is implemented by spawning the body inside a +//! `Builder::stack_size`-configured thread, since `loom::model::Builder` +//! exposes no equivalent knob -- this costs one of loom's 5 thread +//! slots, fine for every test today. /// Run `body` under the currently selected concurrency backend. -/// -/// See the module docs for the per-backend dispatch table. +#[allow(unused_variables)] // `body` may be unused in arms that don't take a closure. +#[allow(clippy::expect_used)] // backend spawn / join: panic-on-failure is the right semantic in a test harness. pub fn stress(body: F) where F: Fn() + Send + Sync + 'static, { - // The feature lattice in `Cargo.toml` makes `feature = "shuttle"` - // true under any shuttle variant, so the const-cfgs here are - // correspondingly simple: ITERATIONS is needed by any shuttle arm, - // SCHEDULES is only consumed by the shuttle_pct arm. #[cfg(all(not(feature = "loom"), feature = "shuttle"))] - const ITERATIONS: usize = 16; - #[cfg(all( - not(feature = "loom"), - not(feature = "shuttle_dfs"), - feature = "shuttle_pct" - ))] - const SCHEDULES: usize = 3; + const ITERATIONS: usize = 256; + #[cfg(all(not(feature = "loom"), feature = "shuttle"))] + const SCHEDULES: usize = 32; + + #[cfg(feature = "loom")] + const LOOM_STACK_SIZE: usize = 4 * 1024 * 1024; + cfg_select! { - feature = "loom" => { loom::model(body); }, - feature = "shuttle_dfs" => { shuttle::check_dfs(body, Some(ITERATIONS)); }, - feature = "shuttle_pct" => { shuttle::check_pct(body, ITERATIONS, SCHEDULES); }, - feature = "shuttle" => { shuttle::check_random(body, ITERATIONS); }, + feature = "loom" => { + // `loom::model::Builder::check` requires `Fn + Sync + Send + 'static` + // but the inner spawn takes `body` by `FnOnce`, so wrap in `Arc`. + // This Arc is shared *across* `loom::model` invocations -- it lives + // outside loom's executor and must remain `std::sync::Arc`; using + // the facade's loom-backend Arc would tie its lifetime to a single + // model run. + let body = std::sync::Arc::new(body); // nosemgrep: rust-no-direct-std-sync-import + loom::model(move || { + let body = body.clone(); + loom::thread::Builder::new() + .stack_size(LOOM_STACK_SIZE) + .spawn(move || body()) + .expect("loom thread spawn") + .join() + .expect("loom body panicked"); + }); + }, + feature = "shuttle" => { + use shuttle::PortfolioRunner; + use shuttle::scheduler::{PctScheduler, RandomScheduler}; + + // `shuttle::Config` is `#[non_exhaustive]`, so mutate `default()`. + let mut config = shuttle::Config::default(); + config.stack_size = 4 * 1024 * 1024; + + let mut portfolio = PortfolioRunner::new(true, config); + portfolio.add(RandomScheduler::new(ITERATIONS)); + portfolio.add(PctScheduler::new(SCHEDULES, ITERATIONS)); + #[cfg(feature = "shuttle_dfs")] + { + use shuttle::scheduler::DfsScheduler; + portfolio.add(DfsScheduler::new(Some(ITERATIONS), false)); + } + portfolio.run(body); + }, not(any(feature = "loom", feature = "shuttle")) => { body(); }, - _ => compile_error!( - "stress: a model-checker feature is enabled but no dispatch \ - arm matched. Either an explicit arm above is missing, or \ - the `not(any(...))` default needs widening to cover the \ - new feature.", - ), + _ => compile_error!("stress: no dispatch arm matched the active feature set"), } } diff --git a/concurrency/src/sync/loom_backend.rs b/concurrency/src/sync/loom_backend.rs index 50a39b6406..7d09ea777e 100644 --- a/concurrency/src/sync/loom_backend.rs +++ b/concurrency/src/sync/loom_backend.rs @@ -50,8 +50,9 @@ use core::ops::{Deref, DerefMut}; use loom::sync as inner; pub use loom::sync::{Barrier, Condvar, WaitTimeoutResult, atomic, mpsc}; +#[allow(clippy::disallowed_types)] pub use std::sync::{ - BarrierWaitResult, LockResult, Once, OnceLock, OnceState, PoisonError, TryLockError, + BarrierWaitResult, LazyLock, LockResult, Once, OnceLock, OnceState, PoisonError, TryLockError, TryLockResult, }; diff --git a/concurrency/src/sync/mod.rs b/concurrency/src/sync/mod.rs index 2ea01b9a6f..0545614964 100644 --- a/concurrency/src/sync/mod.rs +++ b/concurrency/src/sync/mod.rs @@ -11,10 +11,11 @@ //! * `loom` feature: poison-as-panic wrapper around `loom::sync`, plus //! a local `Arc` / `Weak` shim (loom 0.7 ships `Arc` but no //! `Weak`). -//! * `shuttle` / `shuttle_pct` / `shuttle_dfs` features: poison-as-panic -//! wrapper around `shuttle::sync`. All three flavours share one -//! wrapper module; the scheduler difference is runtime-only (see -//! `concurrency::stress`). +//! * `shuttle` feature (plus the additive `shuttle_dfs` opt-in): +//! poison-as-panic wrapper around `shuttle::sync`. The scheduler +//! choice is runtime-only -- `concurrency::stress` builds a +//! `shuttle::PortfolioRunner` that runs Random and PCT in parallel, +//! and adds DFS to the portfolio when `shuttle_dfs` is on. //! * `parking_lot` feature (default): zero-cost re-export of //! `parking_lot`'s naked-guard locks; the production hot path. //! Skipped when `_strict_provenance` is on, even if `parking_lot` diff --git a/concurrency/src/sync/parking_lot_backend.rs b/concurrency/src/sync/parking_lot_backend.rs index 81b8b15421..2452e805cd 100644 --- a/concurrency/src/sync/parking_lot_backend.rs +++ b/concurrency/src/sync/parking_lot_backend.rs @@ -16,7 +16,8 @@ pub use parking_lot::{ Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockUpgradableReadGuard, RwLockWriteGuard, }; +#[allow(clippy::disallowed_types)] pub use std::sync::{ - Arc, Barrier, BarrierWaitResult, Condvar, LockResult, Once, OnceLock, OnceState, PoisonError, - TryLockError, TryLockResult, WaitTimeoutResult, Weak, atomic, mpsc, + Arc, Barrier, BarrierWaitResult, Condvar, LazyLock, LockResult, Once, OnceLock, OnceState, + PoisonError, TryLockError, TryLockResult, WaitTimeoutResult, Weak, atomic, mpsc, }; diff --git a/concurrency/src/sync/shuttle_backend.rs b/concurrency/src/sync/shuttle_backend.rs index 9a710edd48..6b7fd4cfb7 100644 --- a/concurrency/src/sync/shuttle_backend.rs +++ b/concurrency/src/sync/shuttle_backend.rs @@ -25,7 +25,8 @@ pub use shuttle::sync::{ Arc, Barrier, BarrierWaitResult, Condvar, LockResult, Once, OnceState, PoisonError, TryLockError, TryLockResult, WaitTimeoutResult, Weak, atomic, mpsc, }; -pub use std::sync::OnceLock; +#[allow(clippy::disallowed_types)] +pub use std::sync::{LazyLock, OnceLock}; #[inline(never)] #[cold] diff --git a/concurrency/src/sync/std_backend.rs b/concurrency/src/sync/std_backend.rs index 4313a87f50..b6b3c2ca19 100644 --- a/concurrency/src/sync/std_backend.rs +++ b/concurrency/src/sync/std_backend.rs @@ -15,14 +15,15 @@ // Wrapping below panics on poison. clippy::panic is denied at the // crate root; allow it locally for the cold poisoned() helper. #![allow(clippy::panic)] +#![allow(clippy::disallowed_types)] use core::fmt; use core::ops::{Deref, DerefMut}; use std::sync as inner; pub use std::sync::{ - Arc, Barrier, BarrierWaitResult, Condvar, LockResult, Once, OnceLock, OnceState, PoisonError, - TryLockError, TryLockResult, WaitTimeoutResult, Weak, atomic, mpsc, + Arc, Barrier, BarrierWaitResult, Condvar, LazyLock, LockResult, Once, OnceLock, OnceState, + PoisonError, TryLockError, TryLockResult, WaitTimeoutResult, Weak, atomic, mpsc, }; #[inline(never)] diff --git a/concurrency/src/thread/loom_scope.rs b/concurrency/src/thread/loom_scope.rs index 369ac1d48b..2e42a8703f 100644 --- a/concurrency/src/thread/loom_scope.rs +++ b/concurrency/src/thread/loom_scope.rs @@ -88,7 +88,7 @@ use core::marker::PhantomData; use core::panic::AssertUnwindSafe; use loom::sync::Arc; -use loom::thread::{self, JoinHandle}; +use loom::thread::JoinHandle; use std::panic::{catch_unwind, resume_unwind}; use crate::sync::Mutex; @@ -370,7 +370,7 @@ impl<'scope> Scope<'scope, '_> { >(Box::new(wrapped)) }; - let join_handle = thread::spawn(wrapped); + let join_handle = super::spawn(wrapped); // Shared handle slot: `scope()` and the user's // `ScopedJoinHandle` both hold an `Arc` clone of the same diff --git a/concurrency/src/thread/mod.rs b/concurrency/src/thread/mod.rs index d468568de9..105ede4e79 100644 --- a/concurrency/src/thread/mod.rs +++ b/concurrency/src/thread/mod.rs @@ -31,7 +31,35 @@ pub use std::thread::*; pub use shuttle::thread::*; #[cfg(all(feature = "loom", not(feature = "silence_clippy")))] -pub use loom::thread::*; +pub use loom::thread::{ + AccessError, Builder, JoinHandle, LocalKey, Thread, ThreadId, current, panicking, park, + yield_now, +}; + +/// Spawn a loom thread +/// +/// # Panics +/// Panics if loom's scheduler refuses the spawn (the underlying +/// `Builder::spawn` returns `io::Result`; for loom this is effectively +/// infallible, so the `expect` is a stand-in for `unwrap_unchecked`). +#[cfg(all(feature = "loom", not(feature = "silence_clippy")))] +#[allow(clippy::expect_used)] +pub fn spawn(f: F) -> JoinHandle +where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static, +{ + Builder::new() + .stack_size(4 * 1024 * 1024) + .spawn(f) + .expect("loom thread spawn") +} + +/// Backend-portable `sleep` shim under loom. +#[cfg(all(feature = "loom", not(feature = "silence_clippy")))] +pub fn sleep(_: core::time::Duration) { + loom::thread::yield_now(); +} #[cfg(all(feature = "loom", not(feature = "silence_clippy")))] mod loom_scope; @@ -45,3 +73,75 @@ pub use loom_scope::{Scope, ScopedJoinHandle, scope}; // type-checks; it is never executed in that configuration. #[cfg(all(feature = "shuttle", feature = "loom", feature = "silence_clippy"))] pub use std::thread::*; + +// `Builder::spawn_scoped` is only inherent on `std::thread::Builder`; +// loom/shuttle provide `Scope::spawn` instead. This trait closes the gap. +pub trait BuilderExt { + /// Spawn a thread within `scope`. + /// + /// # Errors + /// Returns the underlying [`std::io::Error`] if the backend fails to + /// spawn the thread. Under loom and shuttle this is always `Ok`. + fn spawn_scoped<'scope, 'env, F, T>( + self, + scope: &'scope Scope<'scope, 'env>, + f: F, + ) -> std::io::Result> + where + F: FnOnce() -> T + Send + 'scope, + T: Send + 'scope; +} + +#[cfg(not(any(feature = "loom", feature = "shuttle")))] +impl BuilderExt for Builder { + fn spawn_scoped<'scope, 'env, F, T>( + self, + scope: &'scope Scope<'scope, 'env>, + f: F, + ) -> std::io::Result> + where + F: FnOnce() -> T + Send + 'scope, + T: Send + 'scope, + { + // Fully-qualified path to avoid recursing into the trait impl. + std::thread::Builder::spawn_scoped(self, scope, f) + } +} + +#[cfg(all( + feature = "shuttle", + not(feature = "loom"), + not(feature = "silence_clippy") +))] +impl BuilderExt for Builder { + fn spawn_scoped<'scope, 'env, F, T>( + self, + scope: &'scope Scope<'scope, 'env>, + f: F, + ) -> std::io::Result> + where + F: FnOnce() -> T + Send + 'scope, + T: Send + 'scope, + { + // Discard advisory Builder config; shuttle doesn't model OS + // thread name or stack size. + let _ = self; + Ok(scope.spawn(f)) + } +} + +#[cfg(all(feature = "loom", not(feature = "silence_clippy")))] +impl BuilderExt for Builder { + fn spawn_scoped<'scope, 'env, F, T>( + self, + scope: &'scope Scope<'scope, 'env>, + f: F, + ) -> std::io::Result> + where + F: FnOnce() -> T + Send + 'scope, + T: Send + 'scope, + { + let _ = self; + Ok(scope.spawn(f)) + } +} diff --git a/concurrency/tests/arc_weak.rs b/concurrency/tests/arc_weak.rs index 6fd38fc3ab..a7a49a64d3 100644 --- a/concurrency/tests/arc_weak.rs +++ b/concurrency/tests/arc_weak.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Direct coverage for the `concurrency::sync::Arc` wrapper and //! `Weak` shim. //! @@ -26,15 +28,15 @@ //! tests gated to `concurrency = "loom"` to avoid asserting on real //! `std::sync` / `shuttle::sync` semantics. //! -//! `shuttle_pct` is opted out at file level: PCT is for biasing toward -//! rare interleavings of concurrent code, but most of the tests in this -//! file are protocol-level checks on `Arc` / `Weak` and either run on a -//! single thread or only briefly spawn a helper. PCT panics on bodies -//! that do not exercise sustained concurrency on the main thread, and -//! the contract being tested here is identical to what the plain -//! `shuttle` (random) variant already covers. - -#![cfg(not(feature = "shuttle_pct"))] +//! `shuttle` is opted out at file level: the portfolio runs PCT +//! alongside Random, and PCT panics on bodies that do not exercise +//! sustained concurrency on the main thread. Most of the tests in +//! this file are protocol-level checks on `Arc` / `Weak` that either +//! run on a single thread or only briefly spawn a helper, so they +//! cannot satisfy PCT. The contract being tested here is identical +//! to what loom already covers. + +#![cfg(not(feature = "shuttle"))] // `#[concurrency::test]` is provided by `dataplane-concurrency`; alias // the crate so the macro path resolves inside this integration test. diff --git a/concurrency/tests/quiescent_model.rs b/concurrency/tests/quiescent_model.rs index 2ad62e556d..4b022fd21d 100644 --- a/concurrency/tests/quiescent_model.rs +++ b/concurrency/tests/quiescent_model.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Model-checking tests for `dataplane_concurrency::quiescent`. //! //! Each test is marked `#[concurrency::test]`, which routes the body to @@ -8,8 +10,8 @@ //! //! * default -- runs the body once directly (smoke test) //! * `loom` -- exhaustive interleaving exploration via `loom::model` -//! * `shuttle` / `shuttle_pct` / `shuttle_dfs` -- randomized / PCT / -//! DFS schedule exploration +//! * `shuttle` -- portfolio runs Random + PCT in parallel for each test; +//! the additive `shuttle_dfs` feature adds DFS to the same portfolio //! //! Run under loom (the headline use case) with: //! @@ -81,10 +83,11 @@ fn subscriber_drop_during_publish_is_safe() { /// published value, not the initial. This pins down the /// publish-then-snapshot ordering. /// -/// Skipped under `shuttle_pct`: this test is single-threaded by design -/// and PCT specifically panics on closures that don't exercise -/// concurrency. The other backends accept it. -#[cfg(not(feature = "shuttle_pct"))] +/// Skipped under `shuttle`: this test is single-threaded by design, +/// and the shuttle portfolio always runs PCT, which panics on closures +/// that don't exercise concurrency. The default and loom backends +/// accept it. +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn snapshot_after_publish_observes_published() { let publisher = channel(0u32); diff --git a/concurrency/tests/quiescent_properties.rs b/concurrency/tests/quiescent_properties.rs index a862822e09..fd2cf8576e 100644 --- a/concurrency/tests/quiescent_properties.rs +++ b/concurrency/tests/quiescent_properties.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Property-based protocol tests for `dataplane_concurrency::quiescent`. //! //! Generates random sequences of [`Op`]s and checks the diff --git a/concurrency/tests/quiescent_protocol.rs b/concurrency/tests/quiescent_protocol.rs index 2abb01469f..b2c0a509ff 100644 --- a/concurrency/tests/quiescent_protocol.rs +++ b/concurrency/tests/quiescent_protocol.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Multi-threaded protocol tests for `dataplane_concurrency::quiescent`. //! //! The single-threaded protocol invariants (snapshot legality, diff --git a/concurrency/tests/quiescent_shuttle.rs b/concurrency/tests/quiescent_shuttle.rs index f4ff609e2d..3e7a51f917 100644 --- a/concurrency/tests/quiescent_shuttle.rs +++ b/concurrency/tests/quiescent_shuttle.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Bolero x shuttle property tests. //! //! Generates a [`Plan`] (Publisher ops + Subscriber ops, dispatched to diff --git a/concurrency/tests/scope_property.rs b/concurrency/tests/scope_property.rs index a13a6a3e53..9e323e3808 100644 --- a/concurrency/tests/scope_property.rs +++ b/concurrency/tests/scope_property.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Bolero property test for `thread::scope`. //! //! Generates a [`Plan`] (a small number of spawned threads, each with a diff --git a/concurrency/tests/stress_dispatch.rs b/concurrency/tests/stress_dispatch.rs index aebda65798..b87deb9eaa 100644 --- a/concurrency/tests/stress_dispatch.rs +++ b/concurrency/tests/stress_dispatch.rs @@ -1,37 +1,28 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Tests for `concurrency::stress` backend dispatch. //! //! `stress(body)` is the small router that `#[concurrency::test]` -//! expands to: it picks one of `loom::model`, -//! `shuttle::check_random` / `_pct` / `_dfs`, or direct `body()` based -//! on the active backend's feature. The dispatch table lives in -//! `concurrency/src/stress.rs`. -//! -//! This file pins two coarse but important properties: -//! -//! 1. On the default backend, `stress` invokes `body` exactly once. -//! There is no scheduling exploration; the call should round-trip -//! untouched. +//! expands to: it picks one of `loom::model`, the shuttle +//! `PortfolioRunner` (Random + PCT, optionally DFS), or direct +//! `body()` based on the active backend's feature. The dispatch +//! table lives in `concurrency/src/stress.rs`. //! -//! 2. On `loom` or `shuttle` (random scheduler), `stress` invokes -//! `body` more than once -- the backend explores multiple -//! schedules / interleavings. Exact counts depend on the backend's -//! internal iteration budget and can change; the test only asserts -//! the contract that exploration actually happens. +//! This file pins one coarse but important property: on the default +//! backend, `stress` invokes `body` exactly once. There is no +//! scheduling exploration; the call should round-trip untouched. //! -//! PCT and DFS are skipped: PCT panics on test bodies that do no -//! concurrent work *on the main thread*, and DFS returns after a -//! single iteration in the schedule we hand it. Both are valid -//! shuttle schedulers but stricter than `check_random`; the dispatch -//! contract is the same for all three, so verifying it under -//! `shuttle` + `loom` is enough. +//! The corresponding "more-than-once" check for the model-checker +//! backends is skipped: PCT panics on bodies that do no concurrent +//! work *on the main thread* (and the portfolio always runs PCT +//! under `shuttle`), so the obvious `INVOCATIONS` counter shape can't +//! cohabit with the portfolio. The dispatch contract is identical +//! for loom (where it is exercised) and shuttle. -// With the `shuttle_dfs -> shuttle_pct -> shuttle` chain in -// `Cargo.toml`, `not(feature = "shuttle_pct")` is true exactly when -// neither PCT nor DFS is selected. -#![cfg(not(feature = "shuttle_pct"))] +#![cfg(not(feature = "shuttle"))] extern crate dataplane_concurrency as concurrency; @@ -100,14 +91,13 @@ fn model_check_backend_invokes_body_more_than_once() { // emitted attributes (or swallows them) breaks here loudly instead // of silently turning real test signals into no-ops. -#[cfg(not(feature = "shuttle_pct"))] #[concurrency::test] #[should_panic(expected = "intentional")] fn should_panic_attribute_attaches() { panic!("intentional"); } -#[cfg(not(any(feature = "loom", feature = "shuttle_pct")))] +#[cfg(not(feature = "loom"))] #[concurrency::test] #[ignore = "verifies #[ignore] threads through; not run by default"] fn ignore_attribute_attaches() { diff --git a/concurrency/tests/thread_scope.rs b/concurrency/tests/thread_scope.rs index 6586b3fcdc..4257e5b23a 100644 --- a/concurrency/tests/thread_scope.rs +++ b/concurrency/tests/thread_scope.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +#![allow(clippy::disallowed_types)] + //! Direct coverage for `concurrency::thread::scope` -- the loom shim //! in particular, but the tests pass under every backend. //! @@ -35,13 +37,14 @@ use concurrency::thread; // Several tests below have the spawn-and-wait shape ("main spawns, // joins via the implicit auto-join, reads only after scope returns"), // which PCT counts as "the main thread did no concurrent work" and -// panics on. Same approach `quiescent_model.rs` takes for its -// single-threaded `snapshot_after_publish_observes_published` test. -// Tests with two or more spawns issuing atomic ops (e.g. -// `multiple_spawns_all_join_before_return`) are PCT-compatible. +// panics on. The shuttle portfolio always runs PCT, so those tests +// are skipped under `feature = "shuttle"` entirely. Tests with two +// or more spawns issuing atomic ops (e.g. +// `multiple_spawns_all_join_before_return`) are PCT-compatible and +// stay enabled under shuttle. /// `scope()` returns the body's value. -#[cfg(not(feature = "shuttle_pct"))] +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn scope_returns_body_value() { let v = thread::scope(|_| 42u32); @@ -50,7 +53,7 @@ fn scope_returns_body_value() { /// A single spawned thread is joined before `scope()` returns; the /// `AtomicUsize` it wrote is visible to the caller (Acquire on join). -#[cfg(not(feature = "shuttle_pct"))] +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn single_spawn_joins_before_return() { let counter = Arc::new(AtomicUsize::new(0)); @@ -81,7 +84,7 @@ fn multiple_spawns_all_join_before_return() { } /// `ScopedJoinHandle::join` returns the spawned thread's value. -#[cfg(not(feature = "shuttle_pct"))] +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn explicit_join_returns_value() { thread::scope(|s| { @@ -94,7 +97,7 @@ fn explicit_join_returns_value() { /// Spawned closures may borrow data of any lifetime that outlives the /// scope -- the headline `std::thread::scope` guarantee. Under loom /// this is the shim's `mem::transmute` doing its job. -#[cfg(not(feature = "shuttle_pct"))] +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn spawn_can_borrow_from_enclosing_scope() { let counter = Arc::new(AtomicUsize::new(0)); @@ -178,7 +181,7 @@ fn nested_scoped_spawn_is_joined() { /// latest) when the spawned thread is joined -- i.e. before `scope()` /// returns. Pinned via an `AtomicUsize` incremented from within the /// payload's `Drop` impl. -#[cfg(not(feature = "shuttle_pct"))] +#[cfg(not(feature = "shuttle"))] #[concurrency::test] fn moved_value_drop_runs_before_scope_returns() { struct Bump(Arc); diff --git a/config/Cargo.toml b/config/Cargo.toml index ef91ac0f5a..4a64689ba8 100644 --- a/config/Cargo.toml +++ b/config/Cargo.toml @@ -13,6 +13,7 @@ testing = [] [dependencies] # internal common = { workspace = true } +concurrency = { workspace = true } k8s-intf = { workspace = true } net = { workspace = true } lpm = { workspace = true } diff --git a/config/src/external/overlay/vpcpeering.rs b/config/src/external/overlay/vpcpeering.rs index 1bfd168450..6edf88d832 100644 --- a/config/src/external/overlay/vpcpeering.rs +++ b/config/src/external/overlay/vpcpeering.rs @@ -7,6 +7,7 @@ use crate::utils::{ check_private_prefixes_dont_overlap, check_public_prefixes_dont_overlap, collapse_prefixes, merge_contiguous_prefixes, merge_overlapping_prefixes, }; +use concurrency::sync::LazyLock; use lpm::prefix::{IpRangeWithPorts, L4Protocol, Prefix, PrefixPortsSet, PrefixWithOptionalPorts}; use std::collections::BTreeMap; use std::time::Duration; @@ -68,8 +69,7 @@ impl VpcExposeNat { } fn empty_set() -> &'static PrefixPortsSet { - static EMPTY_SET: std::sync::LazyLock = - std::sync::LazyLock::new(PrefixPortsSet::new); + static EMPTY_SET: LazyLock = LazyLock::new(PrefixPortsSet::new); &EMPTY_SET } diff --git a/config/src/gwconfig.rs b/config/src/gwconfig.rs index 2c1ad47aee..8eab39dc9c 100644 --- a/config/src/gwconfig.rs +++ b/config/src/gwconfig.rs @@ -6,8 +6,8 @@ use crate::errors::{ConfigError, ConfigResult}; use crate::external::{ExternalConfig, GenId, ValidatedExternalConfig}; use crate::internal::InternalConfig; -use arc_swap::ArcSwap; -use std::sync::Arc; +use concurrency::slot::Slot; +use concurrency::sync::Arc; use std::time::SystemTime; use tracing::debug; @@ -64,7 +64,7 @@ impl GwConfigMeta { #[derive(Debug)] pub struct GwConfig { /// Configuration metadata - pub meta: ArcSwap, + pub meta: Slot, /// Configuration, as received pub external: ExternalConfig, @@ -80,7 +80,7 @@ impl GwConfig { #[must_use] pub fn new(external: ExternalConfig) -> Self { Self { - meta: ArcSwap::new(Arc::from(GwConfigMeta::new(external.genid))), + meta: Slot::new(Arc::from(GwConfigMeta::new(external.genid))), external, internal: None, } @@ -146,7 +146,7 @@ impl GwConfig { #[derive(Debug)] pub struct ValidatedGwConfig { - meta: ArcSwap, + meta: Slot, external: ValidatedExternalConfig, internal: Option, } @@ -158,14 +158,14 @@ impl ValidatedGwConfig { // A unit test verifies this invariant. let external = ValidatedExternalConfig::blank(); Self { - meta: ArcSwap::new(Arc::from(GwConfigMeta::new(external.genid()))), + meta: Slot::new(Arc::from(GwConfigMeta::new(external.genid()))), external, internal: None, } } #[must_use] - pub fn meta(&self) -> &ArcSwap { + pub fn meta(&self) -> &Slot { &self.meta } diff --git a/config/src/utils/collapse.rs b/config/src/utils/collapse.rs index 166257b7ca..af34c2465d 100644 --- a/config/src/utils/collapse.rs +++ b/config/src/utils/collapse.rs @@ -549,7 +549,7 @@ mod tests { #[test] fn test_bolero_collapse_prefix_lists() { let generator = cfg_select! { - miri => { + emulated => { PrefixExcludeAddrsGenerator { prefix_max: 10, exclude_max: 10, diff --git a/dataplane/Cargo.toml b/dataplane/Cargo.toml index 3e9b86e3c0..3deac747ca 100644 --- a/dataplane/Cargo.toml +++ b/dataplane/Cargo.toml @@ -8,6 +8,9 @@ version.workspace = true [features] default = ["dpdk"] dpdk = ["dep:dpdk", "dep:dpdk-sysroot-helper"] +# Bin is gated out under loom (see `src/main.rs`); the feature exists so +# `--features loom` resolves at workspace level and propagates to libs. +loom = ["concurrency/loom"] [dependencies] afpacket = { workspace = true, features = ["async-tokio"] } diff --git a/dataplane/src/drivers/dpdk.rs b/dataplane/src/drivers/dpdk.rs index 0f19032206..6aff203e82 100644 --- a/dataplane/src/drivers/dpdk.rs +++ b/dataplane/src/drivers/dpdk.rs @@ -15,7 +15,7 @@ use dpdk::queue::tx::{TxQueueConfig, TxQueueIndex}; use dpdk::{dev, eal, socket}; use tracing::{debug, error, info, trace, warn}; -use crate::CmdArgs; +use args::CmdArgs; use net::buffer::PacketBufferMut; use net::packet::Packet; use pipeline::sample_nfs::Passthrough; diff --git a/dataplane/src/main.rs b/dataplane/src/main.rs index bac850efe7..f5786d0258 100644 --- a/dataplane/src/main.rs +++ b/dataplane/src/main.rs @@ -5,292 +5,25 @@ #![deny(rustdoc::all)] #![allow(rustdoc::missing_crate_level_docs)] -mod drivers; -mod packet_processor; -mod statistics; - -use crate::packet_processor::start_router; -use crate::statistics::MetricsServer; -use args::{CmdArgs, Parser}; - -use drivers::kernel::DriverKernel; -use mgmt::{ConfigProcessorParams, MgmtParams, start_mgmt}; - -use nix::unistd::gethostname; -use pyroscope::backend::{BackendConfig, PprofConfig, pprof_backend}; -use pyroscope::pyroscope::{PyroscopeAgentBuilder, PyroscopeConfig}; - -use routing::{BmpServerParams, RouterParamsBuilder}; -use tracectl::{custom_target, get_trace_ctl, trace_target}; - -use tracing::{error, info, level_filters::LevelFilter}; - -use concurrency::sync::Arc; -use config::internal::routing::bmp::BmpOptions; -use config::internal::status::DataplaneStatus; -use net::tcp::TcpPort; -use std::time::Duration; -use tokio::sync::RwLock; - -trace_target!("dataplane", LevelFilter::DEBUG, &[]); -custom_target!("Pyroscope", LevelFilter::INFO, &["third-party"]); -custom_target!("kube", LevelFilter::WARN, &["third-party"]); -custom_target!("hyper", LevelFilter::WARN, &["third-party"]); -custom_target!("tower", LevelFilter::WARN, &["third-party"]); - -const PYROSCOPE_APP_NAME: &str = "hedgehog-dataplane"; - -fn init_name(args: &CmdArgs) -> Result { - if let Some(name) = args.get_name() { - Ok(name.clone()) - } else { - let hostname = - gethostname().map_err(|errno| format!("Failed to get hostname: {}", errno.desc()))?; - let name = hostname - .to_str() - .ok_or_else(|| format!("Failed to convert hostname {}", hostname.display()))?; - Ok(name.to_string()) - } -} -fn init_logging(gwname: &str) { - let tctl = get_trace_ctl(); - info!( - " ━━━━━━ Starting dataplane for gateway '{gwname}' (Version = {}) ━━━━━━", - option_env!("VERSION").unwrap_or("dev").to_string() - ); - - tctl.set_default_level(LevelFilter::DEBUG) - .expect("Setting default loglevel failed"); -} - -fn process_tracing_cmds(args: &CmdArgs) { - if let Some(tracing) = args.tracing() - && let Err(e) = get_trace_ctl().setup_from_string(tracing) - { - error!("Invalid tracing configuration: {e}"); - panic!("Invalid tracing configuration: {e}"); - } - if args.show_tracing_tags() { - let out = get_trace_ctl() - .as_string_by_tag() - .unwrap_or_else(|e| e.to_string()); - println!("{out}"); - std::process::exit(0); - } - if args.show_tracing_targets() { - let out = get_trace_ctl() - .as_string() - .unwrap_or_else(|e| e.to_string()); - println!("{out}"); - std::process::exit(0); - } - if args.tracing_config_generate() { - let out = get_trace_ctl() - .as_config_string() - .unwrap_or_else(|e| e.to_string()); - println!("{out}"); - std::process::exit(0); - } -} - -fn parse_bmp_params(args: &CmdArgs) -> (Option, Option) { - if args.bmp_enabled() { - let bind = args.bmp_address(); - let interval: Duration = args.bmp_interval(); - - info!("BMP: enabled, listening on {bind}, interval={:?}", interval); - - // BMP server (for routing crate) - let server = BmpServerParams { - bind_addr: bind, - stats_interval: interval, - min_retry: None, - max_retry: None, - }; - - // BMP options for FRR (for internal config) - let host = bind.ip().to_string(); - let port = TcpPort::try_from(bind.port()).expect("Invalid BMP port"); - let client = BmpOptions::new("bmp1", host, port) - .set_retry(interval, interval.saturating_mul(4u32)) - .set_stats_interval(interval) - .monitor_ipv4(true, true); - - (Some(server), Some(client)) - } else { - info!("BMP: disabled"); - (None, None) - } +// Loom 0.7's `Arc` doesn't impl `CoerceUnsized`, so the +// `Arc::new(closure) as Arc` in `packet_processor` +// won't compile under `--features loom`. The bin isn't model-checked +// anyway, so gate it out and provide a stub `main` to keep cargo happy. +#[cfg(feature = "loom")] +fn main() { + panic!("the dataplane binary is not built under the loom backend"); } -#[allow(clippy::too_many_lines)] +#[cfg(not(feature = "loom"))] fn main() { - let args = CmdArgs::parse(); - let gwname = match init_name(&args) { - Ok(name) => name, - Err(e) => { - eprintln!("Failed to set gateway name: {e}"); - std::process::exit(1); - } - }; - init_logging(&gwname); - - let (bmp_server_params, bmp_client_opts) = parse_bmp_params(&args); - - let dp_status: Arc> = Arc::new(RwLock::new(DataplaneStatus::new())); - - let agent_running = args.pyroscope_url().and_then(|url| { - let pyroscope_config = PyroscopeConfig::default(); - let sample_rate = pyroscope_config.sample_rate; - - match PyroscopeAgentBuilder::new( - url.as_str(), - PYROSCOPE_APP_NAME, - sample_rate, - pyroscope_config.spy_name, - pyroscope_config.spy_version, - pprof_backend( - PprofConfig { sample_rate }, - BackendConfig { - report_thread_name: true, - ..BackendConfig::default() - }, - ), - ) - .build() - { - Ok(agent) => match agent.start() { - Ok(running) => Some(running), - Err(e) => { - error!("Pyroscope start failed: {e}"); - None - } - }, - Err(e) => { - error!("Pyroscope build failed: {e}"); - None - } - } - }); - - process_tracing_cmds(&args); - - let (stop_tx, stop_rx) = std::sync::mpsc::channel(); - let ctrlc_stop_tx = stop_tx.clone(); - ctrlc::set_handler(move || { - ctrlc_stop_tx - .send(0) - .expect("Error sending shutdown signal"); - }) - .expect("failed to set SIGINT handler"); - - /* router parameters */ - let mut binding = RouterParamsBuilder::default(); - let mut rp_builder = binding - .cli_sock_path(args.cli_sock_path()) - .cpi_sock_path(args.cpi_sock_path()) - .frr_agent_path(args.frr_agent_path()) - .dp_status(dp_status.clone()); - - // Only set BMP when it's enabled (strip_option setter expects the inner type) - if let Some(server) = bmp_server_params { - rp_builder = rp_builder.bmp(server); - } - - let Ok(router_params) = rp_builder.build() else { - error!("Bad router configuration"); - panic!("Bad router configuration"); - }; - - // start the router; returns control-plane handles and a pipeline factory - let setup = start_router(router_params).expect("failed to start router"); - - MetricsServer::new(args.metrics_address(), setup.stats); - - // pipeline builder - let pipeline_factory = setup.pipeline; - - /* start management: main thread will be blocked until ready or failure */ - if let Err(e) = start_mgmt(MgmtParams { - config_dir: args.config_dir().cloned(), - hostname: gwname.clone(), - processor_params: ConfigProcessorParams { - router_ctl: setup.router.get_ctl_tx(), - pipeline_data: pipeline_factory().get_data(), - flow_table: setup.flow_table, - vpcmapw: setup.vpcmapw, - nattablesw: setup.nattablesw, - natallocatorw: setup.natallocatorw, - flowfilterw: setup.flowfiltertablesw, - portfw_w: setup.portfw_w, - vpc_stats_store: setup.vpc_stats_store, - dp_status_r: dp_status.clone(), - bmp_options: bmp_client_opts, - }, - }) { - error!("Failed to start mgmt: {e}. Stopping dataplane..."); - std::process::exit(-1); - } - info!("Management is running now"); - - /* start driver with the provided pipeline builder */ - let e = match args.driver_name() { - "dpdk" => { - info!("Using driver DPDK..."); - todo!(); - } - "kernel" => { - info!("Using driver kernel..."); - DriverKernel::start( - stop_tx.clone(), - args.kernel_interfaces(), - args.kernel_num_workers(), - &pipeline_factory, - ) - } - other => { - error!("Unknown driver '{other}'. Aborting..."); - panic!("Packet processing pipeline failed to start. Aborting..."); - } - }; - - if let Err(e) = e { - error!("Failed to start driver: {e}"); - std::process::exit(-1); - } - - let exit_code = stop_rx.recv().expect("failed to receive stop signal"); - info!("Shutting down dataplane"); - if let Some(running) = agent_running { - match running.stop() { - Ok(ready) => ready.shutdown(), - Err(e) => error!("Pyroscope stop failed: {e}"), - } - } - std::process::exit(exit_code); + runtime::main(); } -#[cfg(false)] // disabled until dpdk-sys refactor is complete -#[cfg(test)] -mod test { - use n_vm::in_vm; - - #[test] - #[in_vm] - fn root_filesystem_in_vm_is_read_only() { - let error = std::fs::File::create_new("/some.file").unwrap_err(); - assert_eq!(error.kind(), std::io::ErrorKind::ReadOnlyFilesystem); - } - - #[test] - #[in_vm] - fn run_filesystem_in_vm_is_read_write() { - std::fs::File::create_new("/run/some.file").unwrap(); - } - - #[test] - #[in_vm] - fn tmp_filesystem_in_vm_is_read_write() { - std::fs::File::create_new("/tmp/some.file").unwrap(); - } -} +#[cfg(not(feature = "loom"))] +mod drivers; +#[cfg(not(feature = "loom"))] +mod packet_processor; +#[cfg(not(feature = "loom"))] +mod runtime; +#[cfg(not(feature = "loom"))] +mod statistics; diff --git a/dataplane/src/runtime.rs b/dataplane/src/runtime.rs new file mode 100644 index 0000000000..d75855eadb --- /dev/null +++ b/dataplane/src/runtime.rs @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Open Network Fabric Authors + +use crate::packet_processor::start_router; +use crate::statistics::MetricsServer; +use args::{CmdArgs, Parser}; + +use crate::drivers::kernel::DriverKernel; +use mgmt::{ConfigProcessorParams, MgmtParams, start_mgmt}; + +use nix::unistd::gethostname; +use pyroscope::backend::{BackendConfig, PprofConfig, pprof_backend}; +use pyroscope::pyroscope::{PyroscopeAgentBuilder, PyroscopeConfig}; + +use routing::{BmpServerParams, RouterParamsBuilder}; +use tracectl::{custom_target, get_trace_ctl, trace_target}; + +use tracing::{error, info, level_filters::LevelFilter}; + +use concurrency::sync::Arc; +use config::internal::routing::bmp::BmpOptions; +use config::internal::status::DataplaneStatus; +use net::tcp::TcpPort; +use std::time::Duration; +use tokio::sync::RwLock; + +trace_target!("dataplane", LevelFilter::DEBUG, &[]); +custom_target!("Pyroscope", LevelFilter::INFO, &["third-party"]); +custom_target!("kube", LevelFilter::WARN, &["third-party"]); +custom_target!("hyper", LevelFilter::WARN, &["third-party"]); +custom_target!("tower", LevelFilter::WARN, &["third-party"]); + +const PYROSCOPE_APP_NAME: &str = "hedgehog-dataplane"; + +fn init_name(args: &CmdArgs) -> Result { + if let Some(name) = args.get_name() { + Ok(name.clone()) + } else { + let hostname = + gethostname().map_err(|errno| format!("Failed to get hostname: {}", errno.desc()))?; + let name = hostname + .to_str() + .ok_or_else(|| format!("Failed to convert hostname {}", hostname.display()))?; + Ok(name.to_string()) + } +} +fn init_logging(gwname: &str) { + let tctl = get_trace_ctl(); + info!( + " ━━━━━━ Starting dataplane for gateway '{gwname}' (Version = {}) ━━━━━━", + option_env!("VERSION").unwrap_or("dev").to_string() + ); + + tctl.set_default_level(LevelFilter::DEBUG) + .expect("Setting default loglevel failed"); +} + +fn process_tracing_cmds(args: &CmdArgs) { + if let Some(tracing) = args.tracing() + && let Err(e) = get_trace_ctl().setup_from_string(tracing) + { + error!("Invalid tracing configuration: {e}"); + panic!("Invalid tracing configuration: {e}"); + } + if args.show_tracing_tags() { + let out = get_trace_ctl() + .as_string_by_tag() + .unwrap_or_else(|e| e.to_string()); + println!("{out}"); + std::process::exit(0); + } + if args.show_tracing_targets() { + let out = get_trace_ctl() + .as_string() + .unwrap_or_else(|e| e.to_string()); + println!("{out}"); + std::process::exit(0); + } + if args.tracing_config_generate() { + let out = get_trace_ctl() + .as_config_string() + .unwrap_or_else(|e| e.to_string()); + println!("{out}"); + std::process::exit(0); + } +} + +fn parse_bmp_params(args: &CmdArgs) -> (Option, Option) { + if args.bmp_enabled() { + let bind = args.bmp_address(); + let interval: Duration = args.bmp_interval(); + + info!("BMP: enabled, listening on {bind}, interval={:?}", interval); + + // BMP server (for routing crate) + let server = BmpServerParams { + bind_addr: bind, + stats_interval: interval, + min_retry: None, + max_retry: None, + }; + + // BMP options for FRR (for internal config) + let host = bind.ip().to_string(); + let port = TcpPort::try_from(bind.port()).expect("Invalid BMP port"); + let client = BmpOptions::new("bmp1", host, port) + .set_retry(interval, interval.saturating_mul(4u32)) + .set_stats_interval(interval) + .monitor_ipv4(true, true); + + (Some(server), Some(client)) + } else { + info!("BMP: disabled"); + (None, None) + } +} + +#[allow(clippy::too_many_lines)] +pub fn main() { + let args = CmdArgs::parse(); + let gwname = match init_name(&args) { + Ok(name) => name, + Err(e) => { + eprintln!("Failed to set gateway name: {e}"); + std::process::exit(1); + } + }; + init_logging(&gwname); + + let (bmp_server_params, bmp_client_opts) = parse_bmp_params(&args); + + let dp_status: Arc> = Arc::new(RwLock::new(DataplaneStatus::new())); + + let agent_running = args.pyroscope_url().and_then(|url| { + let pyroscope_config = PyroscopeConfig::default(); + let sample_rate = pyroscope_config.sample_rate; + + match PyroscopeAgentBuilder::new( + url.as_str(), + PYROSCOPE_APP_NAME, + sample_rate, + pyroscope_config.spy_name, + pyroscope_config.spy_version, + pprof_backend( + PprofConfig { sample_rate }, + BackendConfig { + report_thread_name: true, + ..BackendConfig::default() + }, + ), + ) + .build() + { + Ok(agent) => match agent.start() { + Ok(running) => Some(running), + Err(e) => { + error!("Pyroscope start failed: {e}"); + None + } + }, + Err(e) => { + error!("Pyroscope build failed: {e}"); + None + } + } + }); + + process_tracing_cmds(&args); + + let (stop_tx, stop_rx) = std::sync::mpsc::channel(); + let ctrlc_stop_tx = stop_tx.clone(); + ctrlc::set_handler(move || { + ctrlc_stop_tx + .send(0) + .expect("Error sending shutdown signal"); + }) + .expect("failed to set SIGINT handler"); + + /* router parameters */ + let mut binding = RouterParamsBuilder::default(); + let mut rp_builder = binding + .cli_sock_path(args.cli_sock_path()) + .cpi_sock_path(args.cpi_sock_path()) + .frr_agent_path(args.frr_agent_path()) + .dp_status(dp_status.clone()); + + // Only set BMP when it's enabled (strip_option setter expects the inner type) + if let Some(server) = bmp_server_params { + rp_builder = rp_builder.bmp(server); + } + + let Ok(router_params) = rp_builder.build() else { + error!("Bad router configuration"); + panic!("Bad router configuration"); + }; + + // start the router; returns control-plane handles and a pipeline factory + let setup = start_router(router_params).expect("failed to start router"); + + MetricsServer::new(args.metrics_address(), setup.stats); + + // pipeline builder + let pipeline_factory = setup.pipeline; + + /* start management: main thread will be blocked until ready or failure */ + if let Err(e) = start_mgmt(MgmtParams { + config_dir: args.config_dir().cloned(), + hostname: gwname.clone(), + processor_params: ConfigProcessorParams { + router_ctl: setup.router.get_ctl_tx(), + pipeline_data: pipeline_factory().get_data(), + flow_table: setup.flow_table, + vpcmapw: setup.vpcmapw, + nattablesw: setup.nattablesw, + natallocatorw: setup.natallocatorw, + flowfilterw: setup.flowfiltertablesw, + portfw_w: setup.portfw_w, + vpc_stats_store: setup.vpc_stats_store, + dp_status_r: dp_status.clone(), + bmp_options: bmp_client_opts, + }, + }) { + error!("Failed to start mgmt: {e}. Stopping dataplane..."); + std::process::exit(-1); + } + info!("Management is running now"); + + /* start driver with the provided pipeline builder */ + let e = match args.driver_name() { + "dpdk" => { + info!("Using driver DPDK..."); + todo!(); + } + "kernel" => { + info!("Using driver kernel..."); + DriverKernel::start( + stop_tx.clone(), + args.kernel_interfaces(), + args.kernel_num_workers(), + &pipeline_factory, + ) + } + other => { + error!("Unknown driver '{other}'. Aborting..."); + panic!("Packet processing pipeline failed to start. Aborting..."); + } + }; + + if let Err(e) = e { + error!("Failed to start driver: {e}"); + std::process::exit(-1); + } + + let exit_code = stop_rx.recv().expect("failed to receive stop signal"); + info!("Shutting down dataplane"); + if let Some(running) = agent_running { + match running.stop() { + Ok(ready) => ready.shutdown(), + Err(e) => error!("Pyroscope stop failed: {e}"), + } + } + std::process::exit(exit_code); +} + +#[cfg(false)] // disabled until dpdk-sys refactor is complete +#[cfg(test)] +mod test { + use n_vm::in_vm; + + #[test] + #[in_vm] + fn root_filesystem_in_vm_is_read_only() { + let error = std::fs::File::create_new("/some.file").unwrap_err(); + assert_eq!(error.kind(), std::io::ErrorKind::ReadOnlyFilesystem); + } + + #[test] + #[in_vm] + fn run_filesystem_in_vm_is_read_write() { + std::fs::File::create_new("/run/some.file").unwrap(); + } + + #[test] + #[in_vm] + fn tmp_filesystem_in_vm_is_read_write() { + std::fs::File::create_new("/tmp/some.file").unwrap(); + } +} diff --git a/default.nix b/default.nix index 33030ca3bd..126ea0ba94 100644 --- a/default.nix +++ b/default.nix @@ -22,6 +22,7 @@ let else builtins.filter (elm: builtins.isString elm) (builtins.split split-on string); lib = (import sources.nixpkgs { }).lib; + host-arch = (import sources.nixpkgs { }).stdenv.hostPlatform.parsed.cpu.name; platform' = import ./nix/platforms.nix { inherit lib @@ -38,6 +39,7 @@ let instrumentation profile cargo-features + host-arch ; inherit (platform') arch; }; @@ -50,6 +52,7 @@ let instrumentation profile cargo-features + host-arch ; inherit (platform') arch; for-tests = true; @@ -172,6 +175,7 @@ let oras pkg-config python3Packages.pyflakes + qemu-user rust-toolchain shellcheck skopeo diff --git a/dpdk/src/acl/config.rs b/dpdk/src/acl/config.rs index b446c303dc..ddbc307016 100644 --- a/dpdk/src/acl/config.rs +++ b/dpdk/src/acl/config.rs @@ -1165,7 +1165,14 @@ mod tests { assert!(result.is_ok()); } + // TODO: under cross-aarch64, bindgen sees RESULTS_MULTIPLIER as 1 + // (vs 4 on x86_64), so the "non-multiple" branch is unreachable. + // Root cause is in our DPDK binding, not this validator. #[test] + #[cfg_attr( + target_arch = "aarch64", + ignore = "RESULTS_MULTIPLIER binds to 1 under cross-aarch64; see TODO" + )] fn misaligned_categories_rejected() { // 3 is > 1 but not a multiple of RESULTS_MULTIPLIER (4) let result = AclBuildConfig::new(3, sample_field_defs::<1>(), 0); diff --git a/dpdk/src/acl/mod.rs b/dpdk/src/acl/mod.rs index d920fe8eff..85b58f59e5 100644 --- a/dpdk/src/acl/mod.rs +++ b/dpdk/src/acl/mod.rs @@ -743,8 +743,8 @@ mod tests { /// to give the OS scheduler a chance to interleave. #[test] fn classify_concurrent_arc_shared() { - use std::sync::Arc; - use std::thread; + use concurrency::sync::Arc; + use concurrency::thread; let _eal = start_eal(); diff --git a/flow-entry/Cargo.toml b/flow-entry/Cargo.toml index 62670d8278..c0bd08e621 100644 --- a/flow-entry/Cargo.toml +++ b/flow-entry/Cargo.toml @@ -6,9 +6,9 @@ publish.workspace = true version.workspace = true [features] +loom = ["concurrency/loom"] shuttle = ["concurrency/shuttle"] -shuttle_pct = ["concurrency/shuttle_pct", "shuttle"] -shuttle_dfs = ["concurrency/shuttle_dfs", "shuttle_pct"] +shuttle_dfs = ["concurrency/shuttle_dfs", "shuttle"] std = [] testing = [] bolero = ["dep:bolero", "net/bolero"] diff --git a/flow-entry/src/flow_table/nf_lookup.rs b/flow-entry/src/flow_table/nf_lookup.rs index d0867241ff..7fb7395a6b 100644 --- a/flow-entry/src/flow_table/nf_lookup.rs +++ b/flow-entry/src/flow_table/nf_lookup.rs @@ -151,7 +151,7 @@ mod test { .add_stage(flowinfo_creator); const NUM_PACKETS: u16 = cfg_select! { - miri => 10, + emulated => 10, _ => 1000, }; diff --git a/flow-entry/src/flow_table/table.rs b/flow-entry/src/flow_table/table.rs index 1d59fb1f9b..79fbd6f2da 100644 --- a/flow-entry/src/flow_table/table.rs +++ b/flow-entry/src/flow_table/table.rs @@ -235,12 +235,11 @@ impl FlowTable { val.update_status(FlowStatus::Active); drop(table); - #[cfg(not(feature = "shuttle"))] + #[cfg(not(any(feature = "shuttle", feature = "loom")))] Self::start_timer(self.table.clone(), val.clone()); if let Some(old) = result.as_ref() { old.update_status(FlowStatus::Detached); - #[cfg(not(feature = "shuttle"))] old.token.cancel(); } @@ -267,7 +266,6 @@ impl FlowTable { let stale = status != FlowStatus::Active || v.expires_at() <= now; if stale { v.update_status(FlowStatus::Expired); - #[cfg(not(feature = "shuttle"))] v.token.cancel(); count += 1; } @@ -306,7 +304,6 @@ impl FlowTable { let result = table.remove(flow_key); if let Some((_key, flow_info)) = result.as_ref() { flow_info.update_status(FlowStatus::Detached); - #[cfg(not(feature = "shuttle"))] flow_info.token.cancel(); } result @@ -570,7 +567,7 @@ mod tests { } #[tokio::test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] // tokio::time::sleep counts wall-clock seconds, so a 4s sleep under miri's slow // interpreter elapses many real-world seconds and the "extended" flow's std::Instant // deadline gets passed too. Fixing this would require running on tokio's paused @@ -623,7 +620,7 @@ mod tests { } #[tokio::test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] /// Test that invalidating flows causes timer to expire and flows to be removed async fn test_flow_table_flow_reinsertion() { let flow_table = FlowTable::default(); @@ -694,57 +691,21 @@ mod tests { } } - #[concurrency_mode(shuttle)] - mod shuttle_tests { + // Shuttle-only: `FlowTable::insert` spawns a tokio task for the timer + // and would panic without a running runtime; the `start_timer` call is + // bypassed under shuttle. Loom is excluded because FlowTable's DashMap + // panics in loom's end-of-execution cleanup. + #[cfg(feature = "shuttle")] + mod concurrency_tests { use super::*; use crate::flow_table::FlowInfo; use concurrency::sync::Arc; use concurrency::thread; use std::time::Instant; - #[test] - fn test_flow_table_timeout() { - shuttle::check_random( - move || { - let now = Instant::now(); - let two_seconds = Duration::from_secs(2); - - let flow_table = FlowTable::default(); - let flow_key = FlowKey::Unidirectional(FlowKeyData::new( - Some(VpcDiscriminant::VNI(Vni::new_checked(42).unwrap())), - "10.0.0.1".parse::().unwrap(), - "10.0.0.2".parse::().unwrap(), - IpProtoKey::Tcp(TcpProtoKey { - src_port: TcpPort::new_checked(1234).unwrap(), - dst_port: TcpPort::new_checked(5678).unwrap(), - }), - )); - - let flow_info = FlowInfo::new(flow_key, now + two_seconds); - flow_table.insert(flow_info).unwrap(); - - // Flow is active; lookup should return Some. - assert!( - flow_table.lookup(&flow_key).is_some(), - "Flow key should be present" - ); - - // Simulate timer expiration by marking the flow directly. - if let Some(fi) = flow_table.lookup(&flow_key) { - fi.update_status(FlowStatus::Expired); - } - - // Lookup: will find it because we don't expire without tokio nor do lazy removals - let found = flow_table.lookup(&flow_key).unwrap(); - assert_eq!(found.status(), FlowStatus::Expired); - }, - 100, - ); - } - #[allow(clippy::too_many_lines)] - #[test] - #[tracing_test::traced_test] + #[concurrency::test] + #[cfg_attr(not(emulated), tracing_test::traced_test)] fn test_flow_table_concurrent_insert_remove_lookup_expire() { const N: usize = 3; @@ -765,164 +726,154 @@ mod tests { }) .collect(); - shuttle::check_random( - move || { - let flow_table = Arc::new(FlowTable::default()); - - let now = Instant::now(); - - // Insert the first flow - let orig_flow_info = FlowInfo::new(flow_keys[0], now + two_seconds); - flow_table.insert(orig_flow_info).unwrap(); - let flow_info = flow_table.lookup(&flow_keys[0]).unwrap(); - - // This holder will retain the Arc until the inserter thread starts - let mut flow_info_holder = Some(flow_info); - - let mut handles = vec![]; - - // "expirer" thread — simulates what the tokio timer would do. - handles.push( - thread::Builder::new() - .name("expirer".to_string()) - .spawn({ - let flow_table = flow_table.clone(); - let flow_key = flow_keys[0]; - move || { - for _ in 0..N { - thread::yield_now(); - if let Some(fi) = flow_table.lookup(&flow_key) { - fi.update_status(FlowStatus::Expired); - } - } - } - }) - .unwrap(), - ); - - handles.push( - thread::Builder::new() - .name("inserter".to_string()) - .spawn({ - let flow_table = flow_table.clone(); - let flow_info = flow_info_holder.take(); - move || { - for _ in 0..N { - if let Some(flow_info) = flow_info.as_ref() { - flow_table.insert_from_arc(flow_info).unwrap(); - } - thread::yield_now(); - } - } - }) - .unwrap(), - ); - - handles.push( - thread::Builder::new() - .name("remover".to_string()) - .spawn({ - let flow_table = flow_table.clone(); - let flow_key = flow_keys[1]; - move || { - for _ in 0..N { - thread::yield_now(); - flow_table.remove(&flow_key); - } + let flow_table = Arc::new(FlowTable::default()); + + let now = Instant::now(); + + // Insert the first flow + let orig_flow_info = FlowInfo::new(flow_keys[0], now + two_seconds); + flow_table.insert(orig_flow_info).unwrap(); + let flow_info = flow_table.lookup(&flow_keys[0]).unwrap(); + + // This holder will retain the Arc until the inserter thread starts + let mut flow_info_holder = Some(flow_info); + + let mut handles = vec![]; + + // "expirer" thread — simulates what the tokio timer would do. + handles.push( + thread::Builder::new() + .name("expirer".to_string()) + .spawn({ + let flow_table = flow_table.clone(); + let flow_key = flow_keys[0]; + move || { + for _ in 0..N { + thread::yield_now(); + if let Some(fi) = flow_table.lookup(&flow_key) { + fi.update_status(FlowStatus::Expired); } - }) - .unwrap(), - ); - - handles.push( - thread::Builder::new() - .name("lookup_and_lock".to_string()) - .spawn({ - let flow_table = flow_table.clone(); - let flow_key = flow_keys[1]; - move || { - for _ in 0..N { - thread::yield_now(); - if let Some(flow_info) = flow_table.lookup(&flow_key) { - let _guard = flow_info.locked.write(); - } - } + } + } + }) + .unwrap(), + ); + + handles.push( + thread::Builder::new() + .name("inserter".to_string()) + .spawn({ + let flow_table = flow_table.clone(); + let flow_info = flow_info_holder.take(); + move || { + for _ in 0..N { + if let Some(flow_info) = flow_info.as_ref() { + flow_table.insert_from_arc(flow_info).unwrap(); } - }) - .unwrap(), - ); + thread::yield_now(); + } + } + }) + .unwrap(), + ); - for handle in handles { - handle.join().unwrap(); - } + handles.push( + thread::Builder::new() + .name("remover".to_string()) + .spawn({ + let flow_table = flow_table.clone(); + let flow_key = flow_keys[1]; + move || { + for _ in 0..N { + thread::yield_now(); + flow_table.remove(&flow_key); + } + } + }) + .unwrap(), + ); - // After all threads, flow[0] should be expired/gone (expirer thread ran). - // Since timers are not started in shuttle tests, the flow should be there - // but appear as Expired or Detached. Re-inserting a flow makes it active again, - // therefore, the only non-feasible status is Cancellled. - let found = flow_table.lookup(&flow_keys[0]).unwrap(); - assert_ne!(found.status(), FlowStatus::Cancelled); - }, - 100, + handles.push( + thread::Builder::new() + .name("lookup_and_lock".to_string()) + .spawn({ + let flow_table = flow_table.clone(); + let flow_key = flow_keys[1]; + move || { + for _ in 0..N { + thread::yield_now(); + if let Some(flow_info) = flow_table.lookup(&flow_key) { + let _guard = flow_info.locked.write(); + } + } + } + }) + .unwrap(), ); + + for handle in handles { + handle.join().unwrap(); + } + + // After all threads, flow[0] should be expired/gone (expirer thread ran). + // Since timers are not started in shuttle tests, the flow should be there + // but appear as Expired or Detached. Re-inserting a flow makes it active again, + // therefore, the only non-feasible status is Cancellled. + let found = flow_table.lookup(&flow_keys[0]).unwrap(); + assert_ne!(found.status(), FlowStatus::Cancelled); } - #[test] + #[concurrency::test] fn test_flow_table_reshard() { - shuttle::check_random( - move || { - let flow_table = Arc::new(FlowTable::default()); - - let five_seconds_from_now = Instant::now() + Duration::from_secs(5); - let flow_key1 = FlowKey::Unidirectional(FlowKeyData::new( - Some(VpcDiscriminant::VNI(Vni::new_checked(1).unwrap())), - "1.2.3.4".parse::().unwrap(), - "4.5.6.7".parse::().unwrap(), - IpProtoKey::Tcp(TcpProtoKey { - src_port: TcpPort::new_checked(1025).unwrap(), - dst_port: TcpPort::new_checked(2048).unwrap(), - }), - )); + let flow_table = Arc::new(FlowTable::default()); - let flow_key2 = FlowKey::Unidirectional(FlowKeyData::new( - Some(VpcDiscriminant::VNI(Vni::new_checked(10).unwrap())), - "10.2.3.4".parse::().unwrap(), - "40.5.6.7".parse::().unwrap(), - IpProtoKey::Tcp(TcpProtoKey { - src_port: TcpPort::new_checked(1025).unwrap(), - dst_port: TcpPort::new_checked(2048).unwrap(), - }), - )); - - let flow_table_clone1 = flow_table.clone(); - let flow_table_clone2 = flow_table.clone(); - let flow_table_clone3 = flow_table.clone(); - - let mut handles = vec![]; - - handles.push(thread::spawn(move || { - let flow_info = FlowInfo::new(flow_key1, five_seconds_from_now); - flow_table_clone1.insert(flow_info).unwrap(); - let result = flow_table_clone1.remove(&flow_key1).unwrap(); - assert!(result.0 == flow_key1); - })); - - handles.push(thread::spawn(move || { - let flow_info = FlowInfo::new(flow_key2, five_seconds_from_now); - flow_table_clone2.insert(flow_info).unwrap(); - let result = flow_table.remove(&flow_key2).unwrap(); - assert!(result.0 == flow_key2); - })); - - handles.push(thread::spawn(move || flow_table_clone3.reshard(128))); - - let _results: Vec<()> = handles - .into_iter() - .map(|handle| handle.join().unwrap()) - .collect(); - }, - 100, - ); + let five_seconds_from_now = Instant::now() + Duration::from_secs(5); + let flow_key1 = FlowKey::Unidirectional(FlowKeyData::new( + Some(VpcDiscriminant::VNI(Vni::new_checked(1).unwrap())), + "1.2.3.4".parse::().unwrap(), + "4.5.6.7".parse::().unwrap(), + IpProtoKey::Tcp(TcpProtoKey { + src_port: TcpPort::new_checked(1025).unwrap(), + dst_port: TcpPort::new_checked(2048).unwrap(), + }), + )); + + let flow_key2 = FlowKey::Unidirectional(FlowKeyData::new( + Some(VpcDiscriminant::VNI(Vni::new_checked(10).unwrap())), + "10.2.3.4".parse::().unwrap(), + "40.5.6.7".parse::().unwrap(), + IpProtoKey::Tcp(TcpProtoKey { + src_port: TcpPort::new_checked(1025).unwrap(), + dst_port: TcpPort::new_checked(2048).unwrap(), + }), + )); + + let flow_table_clone1 = flow_table.clone(); + let flow_table_clone2 = flow_table.clone(); + let flow_table_clone3 = flow_table.clone(); + + let mut handles = vec![]; + + handles.push(thread::spawn(move || { + let flow_info = FlowInfo::new(flow_key1, five_seconds_from_now); + flow_table_clone1.insert(flow_info).unwrap(); + let result = flow_table_clone1.remove(&flow_key1).unwrap(); + assert!(result.0 == flow_key1); + })); + + handles.push(thread::spawn(move || { + let flow_info = FlowInfo::new(flow_key2, five_seconds_from_now); + flow_table_clone2.insert(flow_info).unwrap(); + let result = flow_table.remove(&flow_key2).unwrap(); + assert!(result.0 == flow_key2); + })); + + handles.push(thread::spawn(move || flow_table_clone3.reshard(128))); + + let _results: Vec<()> = handles + .into_iter() + .map(|handle| handle.join().unwrap()) + .collect(); } } } diff --git a/flow-filter/src/tests.rs b/flow-filter/src/tests.rs index ba19724ef4..a78d9d14a2 100644 --- a/flow-filter/src/tests.rs +++ b/flow-filter/src/tests.rs @@ -626,7 +626,7 @@ fn test_flow_filter_packet_icmp_filtered() { assert_eq!(packet_out.meta().dst_vpcd, None); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_from_overlay() { let vni1 = Vni::new_checked(100).unwrap(); @@ -734,7 +734,7 @@ fn test_flow_filter_table_from_overlay() { assert_eq!(packet_out.meta().dst_vpcd, None); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_check_send_from_default() { let vni1 = Vni::new_checked(100).unwrap(); @@ -775,7 +775,7 @@ fn test_flow_filter_table_check_send_from_default() { assert_eq!(packet_out.meta().dst_vpcd, Some(vni2.into())); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_check_default_to_default() { let vni1 = Vni::new_checked(100).unwrap(); @@ -822,7 +822,7 @@ fn test_flow_filter_table_check_default_to_default() { assert_eq!(packet_out.meta().dst_vpcd, Some(vni2.into())); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_check_nat_requirements() { let vni1 = Vni::new_checked(100).unwrap(); @@ -951,7 +951,7 @@ fn test_flow_filter_table_check_nat_requirements() { assert!(needs_masquerade(&packet_out)); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_check_stateful_nat_plus_peer_forwarding() { let vni1 = vni(100); @@ -1147,7 +1147,7 @@ fn test_flow_filter_table_check_stateful_nat_plus_peer_forwarding() { } #[test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] fn test_flow_filter_protocol_aware_port_forwarding() { // Test that protocol-specific port forwarding correctly filters by L4 protocol. // Setup: TCP-only port forwarding overlapping with stateful NAT. @@ -1281,7 +1281,7 @@ fn test_flow_filter_protocol_aware_port_forwarding() { } #[test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] fn test_flow_filter_protocol_any_port_forwarding() { // Test that L4Protocol::Any port forwarding works for both TCP and UDP packets. @@ -1359,7 +1359,7 @@ fn test_flow_filter_protocol_any_port_forwarding() { assert!(needs_port_forwarding(&packet_out)); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_from_overlay_masquerade_port_forwarding_private_ips_overlap() { let vni1 = Vni::new_checked(100).unwrap(); @@ -1578,7 +1578,7 @@ fn test_flow_filter_table_from_overlay_masquerade_port_forwarding_private_ips_ov // forwarding, although the latter is restricted to specific ports. This test validates that // prefix splitting occurs correctly for this configuration, and that we find the right // destination and NAT requirements. -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_from_overlay_masquerade_port_forwarding_private_ips_overlap_smaller_masquerade() { @@ -1783,7 +1783,7 @@ fn test_flow_filter_table_from_overlay_masquerade_port_forwarding_private_ips_ov assert!(needs_masquerade(&packet_out)); } -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[test] fn test_flow_filter_table_from_overlay_masquerade_port_forwarding_private_ips_overlap_to_default() { let vni1 = Vni::new_checked(100).unwrap(); diff --git a/interface-manager/Cargo.toml b/interface-manager/Cargo.toml index 08c8567aff..0616bfb019 100644 --- a/interface-manager/Cargo.toml +++ b/interface-manager/Cargo.toml @@ -12,6 +12,7 @@ bolero = ["dep:bolero", "netdevsim", "net/test_buffer", "net/serde", "net/bolero [dependencies] # internal +concurrency = { workspace = true } net = { workspace = true, features = [] } rekon = { workspace = true } diff --git a/interface-manager/src/lib.rs b/interface-manager/src/lib.rs index 7d49139e1a..dcf6e4ab7a 100644 --- a/interface-manager/src/lib.rs +++ b/interface-manager/src/lib.rs @@ -15,8 +15,8 @@ #![allow(missing_docs)] // multi-index-map generates undocumented structures #![allow(clippy::unsafe_derive_deserialize)] // generated code uses unsafe +use concurrency::sync::Arc; use std::marker::PhantomData; -use std::sync::Arc; pub mod interface; pub mod tc; diff --git a/justfile b/justfile index 9b9d520dd9..c189138716 100644 --- a/justfile +++ b/justfile @@ -69,7 +69,8 @@ _cargo_profile_flag := if profile == "debug" { "" } else { "--profile " + profil # right answer: nextest walks every archived binary, the cfg-gated # ones contain no tests, and the loom-compatible ones run under their # `#[concurrency::test]`-routed `loom::model` body. -# Match all shuttle variants (`shuttle`, `shuttle_pct`, `shuttle_dfs`). +# Match all shuttle variants (`shuttle`, plus the additive +# `shuttle_dfs` opt-in). # Under any shuttle backend, `concurrency::sync` types ARE shuttle # primitives, and touching them outside a `shuttle::check_*`-wrapped # body panics with `ExecutionState NotSet`. Tests that are designed @@ -77,7 +78,7 @@ _cargo_profile_flag := if profile == "debug" { "" } else { "--profile " + profil # emits a `concurrency_model::` leaf -- the substring matches) # or live in a `*_shuttle` module / `*shuttle*` binary by convention. # Other workspace tests would fail spuriously without this filter. -filter := if features =~ "^shuttle" { "shuttle" } else { "" } +filter := if features =~ "^shuttle" { "shuttle" } else if features =~ "^loom" { "::concurrency_model::loom" } else { "" } # instrumentation mode (none/coverage) instrument := "none" diff --git a/k8s-intf/Cargo.toml b/k8s-intf/Cargo.toml index 586fb0caaa..5590c150d7 100644 --- a/k8s-intf/Cargo.toml +++ b/k8s-intf/Cargo.toml @@ -11,6 +11,7 @@ default = ["client"] # Disable this feature (default-features = false) for WASM/WASI builds that # only need the generated CRD types. client = [ + "dep:concurrency", "dep:futures", "dep:linkme", "dep:rustls", @@ -24,6 +25,7 @@ bolero = ["dep:bolero", "dep:net", "dep:lpm", "net/test_buffer", "net/bolero"] [dependencies] # internal +concurrency = { workspace = true, optional = true } lpm = { workspace = true, optional = true } net = { workspace = true, optional = true, features = ["bolero"] } tracectl = { workspace = true, optional = true } diff --git a/k8s-intf/src/bolero/support.rs b/k8s-intf/src/bolero/support.rs index ac20c8f022..b1880a381d 100644 --- a/k8s-intf/src/bolero/support.rs +++ b/k8s-intf/src/bolero/support.rs @@ -290,7 +290,7 @@ mod test { #[cfg(miri)] const UNIQUE_COUNTS: [u16; 4] = [0, 1, 10, 16]; const ITERATIONS: usize = cfg_select! { - miri => 3, + emulated => 3, _ => 1000, }; diff --git a/k8s-intf/src/client.rs b/k8s-intf/src/client.rs index c611a90c52..2b84b790e0 100644 --- a/k8s-intf/src/client.rs +++ b/k8s-intf/src/client.rs @@ -1,11 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +use concurrency::sync::Arc; use futures::{StreamExt, TryStreamExt}; use kube::api::PostParams; use kube::runtime::{WatchStreamExt, watcher}; use kube::{Api, Client}; -use std::sync::Arc; use std::time::Duration; use tracectl::trace_target; diff --git a/k8s-less/src/local.rs b/k8s-less/src/local.rs index 9b20a8a80d..8d9339c025 100644 --- a/k8s-less/src/local.rs +++ b/k8s-less/src/local.rs @@ -125,7 +125,7 @@ mod test { use tracing_test::traced_test; #[tokio::test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[ignore = "test is incorrect and needs reworked"] async fn test_kubeless() { let path = "/tmp/kubeless-dir"; diff --git a/left-right-tlcache/Cargo.toml b/left-right-tlcache/Cargo.toml index caccec2325..0b619c15f1 100644 --- a/left-right-tlcache/Cargo.toml +++ b/left-right-tlcache/Cargo.toml @@ -11,4 +11,5 @@ ahash = { workspace = true } thiserror = { workspace = true } [dev-dependencies] +concurrency = { workspace = true } serial_test = { workspace = true } diff --git a/left-right-tlcache/src/lib.rs b/left-right-tlcache/src/lib.rs index c8f7f2ec79..6b8cd1c44f 100644 --- a/left-right-tlcache/src/lib.rs +++ b/left-right-tlcache/src/lib.rs @@ -340,10 +340,10 @@ mod tests { #![allow(clippy::collapsible_if)] use super::*; + use concurrency::sync::Mutex; use left_right::{Absorb, ReadHandleFactory, WriteHandle}; #[cfg(not(miri))] use serial_test::serial; - use std::sync::Mutex; // Our left-right protected struct #[derive(Debug, Clone)] struct TestStruct { @@ -439,7 +439,7 @@ mod tests { if let Some(object) = self.data.get_mut(&key) { if let Some(writer_lock) = &mut object.writer { #[allow(clippy::mut_mutex_lock)] // lock exists just to make provider Sync - let mut writer = writer_lock.lock().unwrap(); + let mut writer = writer_lock.lock(); writer.append(TestStructChange::Update(data.to_owned())); writer.publish(); } @@ -656,7 +656,7 @@ mod tests { // build provider and populate it const NUM_HANDLES: u64 = cfg_select! { - miri => 10, + emulated => 10, _ => 1000, }; let mut provider = TestProvider::new(); diff --git a/mgmt/src/processor/confbuild/router.rs b/mgmt/src/processor/confbuild/router.rs index 4e583c554d..3e7d3ce5f5 100644 --- a/mgmt/src/processor/confbuild/router.rs +++ b/mgmt/src/processor/confbuild/router.rs @@ -3,7 +3,7 @@ //! Functions to build router configurations -use std::sync::Arc; +use concurrency::sync::Arc; use netdev::Interface as NetDevInterface; use netdev::get_interfaces; diff --git a/mgmt/src/processor/gwconfigdb.rs b/mgmt/src/processor/gwconfigdb.rs index 142ac3bce7..32d98b9b75 100644 --- a/mgmt/src/processor/gwconfigdb.rs +++ b/mgmt/src/processor/gwconfigdb.rs @@ -4,8 +4,8 @@ //! Configuration database use crate::processor::confbuild::internal::build_internal_config; +use concurrency::sync::Arc; use config::{ConfigSummary, GenId, GwConfigMeta, ValidatedGwConfig}; -use std::sync::Arc; use tracing::{debug, info}; /// Configuration database, keeps a set of [`GwConfig`]s keyed by generation id [`GenId`] diff --git a/mgmt/src/processor/k8s_client.rs b/mgmt/src/processor/k8s_client.rs index dba96a48d9..208c0f62ee 100644 --- a/mgmt/src/processor/k8s_client.rs +++ b/mgmt/src/processor/k8s_client.rs @@ -3,7 +3,7 @@ //! K8s client that glues the gateway to k8s -use std::sync::Arc; +use concurrency::sync::Arc; use std::time::SystemTime; use chrono::{TimeZone, Utc}; diff --git a/mgmt/src/processor/k8s_less_client.rs b/mgmt/src/processor/k8s_less_client.rs index 22c0626da8..8a44793d9f 100644 --- a/mgmt/src/processor/k8s_less_client.rs +++ b/mgmt/src/processor/k8s_less_client.rs @@ -4,11 +4,11 @@ //! "Kubeless" client that learns configs from a directory and requests //! the configuration processor to apply them. +use concurrency::sync::Arc; use config::{ExternalConfig, GwConfig}; use futures::TryFutureExt; use k8s_less::kubeless_watch_gateway_agent_crd; use std::path::PathBuf; -use std::sync::Arc; use tokio::fs::create_dir_all; use tracing::{error, info}; diff --git a/mgmt/src/processor/mgmt_client.rs b/mgmt/src/processor/mgmt_client.rs index d93a4ea954..01bbb3c5a3 100644 --- a/mgmt/src/processor/mgmt_client.rs +++ b/mgmt/src/processor/mgmt_client.rs @@ -9,7 +9,7 @@ use config::GenId; use config::internal::status::DataplaneStatus; use config::{GwConfig, ValidatedGwConfig}; -use std::sync::Arc; +use concurrency::sync::Arc; use tokio::sync::mpsc::Sender; use tokio::sync::oneshot; use tokio::sync::oneshot::Receiver; diff --git a/mgmt/src/tests/mgmt.rs b/mgmt/src/tests/mgmt.rs index 3147d5adba..508f6099cb 100644 --- a/mgmt/src/tests/mgmt.rs +++ b/mgmt/src/tests/mgmt.rs @@ -388,7 +388,7 @@ pub mod test { .expect("Should succeed") } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn check_frr_config() { /* Not really a test but a tool to check generated FRR configs given a gateway config */ diff --git a/mgmt/src/vpc_manager/mod.rs b/mgmt/src/vpc_manager/mod.rs index 48cbb2fce2..f8144fcc8e 100644 --- a/mgmt/src/vpc_manager/mod.rs +++ b/mgmt/src/vpc_manager/mod.rs @@ -3,6 +3,7 @@ use crate::processor::confbuild::namegen::VpcInterfacesNames; +use concurrency::sync::Arc; use config::InternalConfig; use config::internal::interfaces::interface::{InterfaceConfigTable, InterfaceType}; use config::internal::routing::evpn::VtepConfig; @@ -29,7 +30,6 @@ use rekon::{Observe, Op, Reconcile, Remove}; use rtnetlink::Handle; use serde::{Deserialize, Serialize}; use std::marker::PhantomData; -use std::sync::Arc; use tracing::{debug, error, warn}; #[derive(Clone, Debug)] diff --git a/mgmt/tests/reconcile.rs b/mgmt/tests/reconcile.rs index ed9bd6ced7..ea083634aa 100644 --- a/mgmt/tests/reconcile.rs +++ b/mgmt/tests/reconcile.rs @@ -4,6 +4,7 @@ use dataplane_mgmt as mgmt; use caps::Capability; +use concurrency::sync::Arc; use fixin::wrap; use interface_manager::interface::{ BridgePropertiesSpec, InterfaceAssociationSpec, InterfacePropertiesSpec, InterfaceSpecBuilder, @@ -19,7 +20,6 @@ use net::vxlan::Vxlan; use rekon::{Observe, Reconcile}; use rtnetlink::sys::AsyncSocket; use std::net::Ipv4Addr; -use std::sync::Arc; use std::time::Duration; use test_utils::with_caps; use tracing::info; @@ -28,7 +28,7 @@ use tracing_test::traced_test; #[test] #[n_vm::in_vm] #[wrap(with_caps([Capability::CAP_NET_ADMIN]))] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] fn reconcile_fuzz() { let runtime = tokio::runtime::Builder::new_current_thread() .enable_io() @@ -41,7 +41,14 @@ fn reconcile_fuzz() { panic!("failed to create connection"); }; tokio::spawn(connection); - std::sync::Mutex::new(Arc::new(handle)) + // Bolero's `for_each` uses `catch_unwind` internally, which + // requires the captured state to be `RefUnwindSafe`. The + // parking_lot Mutex behind `concurrency::sync::Mutex` is not, + // because its inner `UnsafeCell` lacks the explicit + // `RefUnwindSafe` impl that `std::sync::Mutex` provides. + // Reach into std here on purpose. + #[allow(clippy::disallowed_types)] + std::sync::Mutex::new(Arc::new(handle)) // nosemgrep: rust-no-direct-std-sync-import }); bolero::check!() .with_type() @@ -78,7 +85,7 @@ fn reconcile_fuzz() { #[allow(clippy::too_many_lines)] // this is an integration test and is expected to be long #[tokio::test] #[wrap(with_caps([Capability::CAP_NET_ADMIN]))] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn reconcile_demo() { let mut required_interface_map = MultiIndexInterfaceSpecMap::default(); let interfaces = [ diff --git a/miri.just b/miri.just index 3f4caae684..0d680962f3 100644 --- a/miri.just +++ b/miri.just @@ -47,8 +47,10 @@ test *args="": if [ "${layout_seed}" = "random" ]; then layout_seed="${RANDOM}" fi - RUSTFLAGS+="-Zrandomize-layout -Zlayout-seed=${layout_seed}" + RUSTFLAGS+="-Zrandomize-layout -Zlayout-seed=${layout_seed} " fi + # Umbrella cfg shared with the qemu-user path in nix/profiles.nix. + RUSTFLAGS+="--cfg=emulated" declare -rx RUSTFLAGS declare -ra cmd=("nice" "-n" "19" "cargo" "miri" "nextest" "run" "--profile=miri" "--target=${target}") echo "Running Miri with args: {{ args }}" diff --git a/nat/Cargo.toml b/nat/Cargo.toml index acf31e6598..3c7378da7b 100644 --- a/nat/Cargo.toml +++ b/nat/Cargo.toml @@ -6,9 +6,9 @@ publish.workspace = true version.workspace = true [features] +loom = ["concurrency/loom"] shuttle = ["concurrency/shuttle", "dep:shuttle"] -shuttle_pct = ["concurrency/shuttle_pct", "shuttle"] -shuttle_dfs = ["concurrency/shuttle_dfs", "shuttle_pct"] +shuttle_dfs = ["concurrency/shuttle_dfs", "shuttle"] [dependencies] ahash = { workspace = true, features = ["std"] } diff --git a/nat/src/lib.rs b/nat/src/lib.rs index 8738823ef0..d7cb3037d7 100644 --- a/nat/src/lib.rs +++ b/nat/src/lib.rs @@ -1,6 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright Open Network Fabric Authors +// Loom's backend wraps `Arc` in a local newtype that isn't a blessed +// self-receiver; needed for the `self: Arc` methods on +// `AllocatedIp` / `AllocatedPortBlock`. +#![cfg_attr(feature = "loom", feature(arbitrary_self_types))] #![deny(clippy::all, clippy::pedantic)] #![deny(rustdoc::all)] #![allow(clippy::missing_errors_doc)] diff --git a/nat/src/portfw/portfwtable/access.rs b/nat/src/portfw/portfwtable/access.rs index d2df4a9768..418a35042c 100644 --- a/nat/src/portfw/portfwtable/access.rs +++ b/nat/src/portfw/portfwtable/access.rs @@ -113,7 +113,7 @@ mod test { } #[test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] fn test_port_forwarding_access_remove_rules_drops_refs() { let mut pfw_table_w = PortFwTableWriter::new(); let reader = pfw_table_w.reader(); @@ -143,7 +143,7 @@ mod test { } #[test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] fn test_port_forwarding_access_modify_rules_keeps_refs() { let mut pfw_table_w = PortFwTableWriter::new(); let reader = pfw_table_w.reader(); @@ -182,7 +182,7 @@ mod test { } #[test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] fn test_port_forwarding_access_replace_ruleset() { let mut pfwt_w = PortFwTableWriter::new(); let reader = pfwt_w.reader(); diff --git a/nat/src/portfw/portfwtable/objects.rs b/nat/src/portfw/portfwtable/objects.rs index 6fa24b5066..bbf13cd43c 100644 --- a/nat/src/portfw/portfwtable/objects.rs +++ b/nat/src/portfw/portfwtable/objects.rs @@ -533,7 +533,7 @@ mod test { } #[test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] fn test_port_forwarding_table_removals() { let mut fwtable = build_sample_port_forwarding_table(); let key = PortFwKey { @@ -571,7 +571,7 @@ mod test { } #[test] - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] fn test_port_forwarding_table_updates() { let mut fwtable = PortFwTable::new(); let key = PortFwKey { diff --git a/nat/src/portfw/test.rs b/nat/src/portfw/test.rs index ad4c08d6a8..75479f9fed 100644 --- a/nat/src/portfw/test.rs +++ b/nat/src/portfw/test.rs @@ -218,7 +218,7 @@ mod nf_test { (flow_table, pipeline, writer) } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_base() { let ruleset = build_test_port_forwarding_ruleset(); @@ -270,7 +270,7 @@ mod nf_test { ); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_nf_port_forwarding_tcp_filtered() { let ruleset = build_test_port_forwarding_ruleset(); @@ -318,7 +318,7 @@ mod nf_test { ); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_tcp_establishment() { let ruleset = build_test_port_forwarding_ruleset(); @@ -330,7 +330,7 @@ mod nf_test { establish_tcp_connection(&mut pipeline); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_tcp_close_server() { let ruleset = build_test_port_forwarding_ruleset(); @@ -368,7 +368,7 @@ mod nf_test { assert_eq!(flow_table.len().unwrap(), 2); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_tcp_close_client() { let ruleset = build_test_port_forwarding_ruleset(); @@ -404,7 +404,7 @@ mod nf_test { assert_eq!(flow_table.len().unwrap(), 2); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_tcp_half_close_client() { let ruleset = build_test_port_forwarding_ruleset(); @@ -450,7 +450,7 @@ mod nf_test { assert_eq!(flow_table.len().unwrap(), 2); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_tcp_reset() { let ruleset = build_test_port_forwarding_ruleset(); @@ -487,7 +487,7 @@ mod nf_test { println!("{flow_table}"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_config_removal_interrupts_traffic() { let ruleset = build_test_port_forwarding_ruleset(); @@ -609,7 +609,7 @@ mod nf_test { state.rule().upgrade().is_some() } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_with_port_ranges() { let ruleset = build_test_port_forwarding_table_with_ranges(); @@ -650,7 +650,7 @@ mod nf_test { println!("{flow_table}"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_with_prefixes_and_port_ranges() { let ruleset = build_test_port_forwarding_table_with_prefixes_and_port_ranges(); @@ -691,7 +691,7 @@ mod nf_test { println!("{flow_table}"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_compatible_rule_updates_preserves_flows() { // check that, when updating a rule, existing flows remain if the new rule would allow them @@ -745,7 +745,7 @@ mod nf_test { println!("{flow_table}"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] async fn test_nf_port_forwarding_incompatible_rule_updates_remove_flows() { // check that, when updating a rule, existing flows remain if the new rule would allow them diff --git a/nat/src/stateful/apalloc/port_alloc.rs b/nat/src/stateful/apalloc/port_alloc.rs index 46581aa1a3..e4fd8a407c 100644 --- a/nat/src/stateful/apalloc/port_alloc.rs +++ b/nat/src/stateful/apalloc/port_alloc.rs @@ -15,11 +15,11 @@ use crate::stateful::allocation::AllocatorError; use concurrency::concurrency_mode; use concurrency::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize}; use concurrency::sync::{Arc, Mutex, RwLock, Weak}; +use concurrency::thread::ThreadId; use config::GenId; use lpm::prefix::PortRange; use std::collections::{BTreeSet, HashMap}; use std::fmt::Display; -use std::thread::ThreadId; use tracing::debug; @@ -161,6 +161,12 @@ impl PortAllocator { } } + #[concurrency_mode(loom)] + fn shuffle_slice(_slice: &mut [T]) { + // Loom needs determinism for replay; shuffle is an allocation-order + // heuristic, not a correctness property. + } + // Iterate over the slice of all blocks, but starting from a given offset (and looping at the // end), returning the block and its index from the initial slice. // @@ -636,13 +642,15 @@ impl ThreadPortMap { fn get(&self) -> Option { self.0 .read() - .get(&std::thread::current().id()) + .get(&concurrency::thread::current().id()) .copied() .unwrap_or(None) } fn set(&self, index: Option) { - self.0.write().insert(std::thread::current().id(), index); + self.0 + .write() + .insert(concurrency::thread::current().id(), index); } } diff --git a/nat/src/stateful/apalloc/test_alloc.rs b/nat/src/stateful/apalloc/test_alloc.rs index b7af37449d..f9aafca962 100644 --- a/nat/src/stateful/apalloc/test_alloc.rs +++ b/nat/src/stateful/apalloc/test_alloc.rs @@ -188,8 +188,6 @@ mod tests { mod std_tests { use super::context::*; use crate::stateful::apalloc::PoolTableKey; - use concurrency::sync::Arc; - use concurrency::thread; use net::ip::NextHeader; #[test] @@ -380,14 +378,22 @@ mod std_tests { assert_eq!(bitmap.len(), 2); // 2 free IP addresses left to NAT 1.1.0.0 (UDP) assert_eq!(in_use.len(), 1); // 1 allocated, in use } +} - // This test is NOT a shuttle test. It validates that a basic example with threads works - // with or without shuttle components (depending on how we compile), as a control test in - // case shuttle tests do not work. For example, it helped understand that memory usage for - // Atomics is different in shuttle than in std, and that just testing simple allocations as - // we do here was not broken - we just needed to increase stack memory for shuttle's runner. - #[test] - fn test_concurrent_allocations_without_shuttle() { +// Loom is excluded: its `Weak` shim never upgrades to `None`, so the +// allocator's `Weak`-as-liveness pattern keeps dropped entries alive +// forever and trips loom's end-of-execution `Arc leaked` assertion. +// See `concurrency::sync` for the shim caveat. +#[cfg(not(feature = "loom"))] +mod concurrency_tests { + use super::context::*; + use super::tests; + use concurrency::sync::Arc; + use concurrency::thread; + use net::ip::NextHeader; + + #[concurrency::test] + fn test_concurrent_allocations_two_ips() { let allocator = build_allocator(); let allocator1 = Arc::new(allocator); let allocator2 = allocator1.clone(); @@ -405,81 +411,26 @@ mod std_tests { t1.join().unwrap(); t2.join().unwrap(); } -} - -#[concurrency_mode(shuttle)] -mod tests_shuttle { - use super::context::*; - use super::tests; - use concurrency::sync::{Arc, Mutex}; - use concurrency::thread; - use net::ip::NextHeader; + #[concurrency::test] + fn test_concurrent_allocations_three_workers() { + tests::concurrent_allocations(); + } + // Default backend's one-shot run is non-deterministic, so this only + // runs under a model checker -- where the race is actually reachable. + #[cfg(any(feature = "loom", feature = "shuttle"))] + #[concurrency::test] #[should_panic(expected = "assertion `left == right` failed")] - #[test] fn test_ensure_shuttle_works() { - shuttle::check_random( - || { - let lock = Arc::new(Mutex::new(0u64)); - let lock2 = lock.clone(); - - thread::spawn(move || { - *lock.lock() = 1; - }); - - assert_eq!(0, *lock2.lock()); - }, - 100, - ); - } - - fn shuttle_config() -> shuttle::Config { - let mut config = shuttle::Config::new(); - // Raise the stack size to avoid stack overflow in the coroutine. The default is 32 kB, but - // the allocator uses Atomics for all port blocks for each allocated IP address, and in - // shuttle an AtomicBool takes over 100 bytes in memory, for example. - // - // Raise to 1 MB stack. - config.stack_size = 1024 * 1024; - config - } - - fn run_shuttle_random(f: F) - where - F: Fn() + Sync + Send + 'static, - { - let config = shuttle_config(); - // One hundred iterations - let runner = shuttle::Runner::new(shuttle::scheduler::RandomScheduler::new(100), config); - runner.run(f); - } + use concurrency::sync::Mutex; + let lock = Arc::new(Mutex::new(0u64)); + let lock2 = lock.clone(); - fn run_shuttle_pct(f: F) - where - F: Fn() + Sync + Send + 'static, - { - let config = shuttle_config(); - // replay test under 64 different schedules - const ITERATIONS: usize = 64; - // max of 4 preemption points per schedule - const PREEMPTIONS: usize = 4; // this is pretty aggressive, very rarely is larger than 3 useful. - let runner = shuttle::Runner::new( - shuttle::scheduler::PctScheduler::new(PREEMPTIONS, ITERATIONS), - config, - ); - runner.run(f); - } - - // Run concurrent allocations for four different tuples (some of them sharing the same source - // and destination IP addresses) using shuttle's random scheduler, see if anything breaks. - #[test] - fn test_concurrent_allocations_shuttle_random() { - run_shuttle_random(tests::concurrent_allocations); - } + thread::spawn(move || { + *lock.lock() = 1; + }); - #[test] - fn test_concurrent_allocations_shuttle_pct() { - run_shuttle_pct(tests::concurrent_allocations); + assert_eq!(0, *lock2.lock()); } } diff --git a/nat/src/stateful/test.rs b/nat/src/stateful/test.rs index da5b649e58..6628c795d2 100644 --- a/nat/src/stateful/test.rs +++ b/nat/src/stateful/test.rs @@ -615,7 +615,7 @@ fn check_packet_icmp_echo_new( } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_icmp_echo_nat() { let config = build_gwconfig_from_overlay(build_overlay_2vpcs()) .validate() @@ -782,7 +782,7 @@ fn check_packet_icmp_error( } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_icmp_error_nat() { // build setup: 2 vpcs with masquerading, vni 100 -> vni 200 let (flow_table, mut pipeline, _allocw) = test_setup(1, &build_overlay_2vpcs()); @@ -1418,7 +1418,7 @@ fn build_overlay_2vpcs_unidirectional_nat_overlapping_exposes() -> Overlay { } #[tokio::test] -#[cfg_attr(not(miri), traced_test)] +#[cfg_attr(not(emulated), traced_test)] #[allow(clippy::too_many_lines)] async fn test_full_config_unidirectional_nat_overlapping_exposes_for_single_peering() { let config = @@ -1728,7 +1728,7 @@ fn establish_tcp_connection(pipeline: &mut DynPipeline) { } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_masquerade_tcp_establish() { // build setup: 2 vpcs with masquerading (vni 100 -> vni 200) let (flow_table, mut pipeline, _allocw) = test_setup(1, &build_overlay_2vpcs()); @@ -1737,7 +1737,7 @@ async fn test_masquerade_tcp_establish() { } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_masquerade_check() { // build setup: 2 vpcs with masquerading (vni 100 -> vni 200) let (flow_table, mut pipeline, _allocw) = test_setup(1, &build_overlay_2vpcs()); @@ -1777,7 +1777,7 @@ async fn test_masquerade_check() { } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_masquerade_tcp_reset() { // build setup: 2 vpcs with masquerading (vni 100 -> vni 200) let (flow_table, mut pipeline, _allocw) = test_setup(1, &build_overlay_2vpcs()); @@ -1812,7 +1812,7 @@ async fn test_masquerade_tcp_reset() { } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_masquerade_reconfig_keep_flow() { let genid = 1; // build setup: 2 vpcs with masquerading (vni 100 -> vni 200) @@ -1848,7 +1848,7 @@ async fn test_masquerade_reconfig_keep_flow() { } #[tokio::test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] async fn test_masquerade_reconfig_drop_flow() { let genid = 1; // build setup: 2 vpcs with masquerading (vni 100 -> vni 200) diff --git a/nat/src/stateless/test.rs b/nat/src/stateless/test.rs index c0b87a1ef6..de77a58150 100644 --- a/nat/src/stateless/test.rs +++ b/nat/src/stateless/test.rs @@ -837,7 +837,7 @@ fn check_packet_with_ports( } #[test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] fn test_config_with_port_ranges_basic() { let expose1 = VpcExpose::empty() .make_stateless_nat() @@ -898,7 +898,7 @@ fn test_config_with_port_ranges_basic() { } #[test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] #[allow(clippy::too_many_lines)] fn test_config_with_port_ranges_complex() { let expose1 = VpcExpose::empty() @@ -1137,7 +1137,7 @@ fn test_config_with_port_ranges_complex() { } #[test] -#[traced_test] +#[cfg_attr(not(emulated), traced_test)] fn test_config_with_port_ranges_with_default() { let expose1 = VpcExpose::empty() .make_stateless_nat() diff --git a/net/src/packet/hash.rs b/net/src/packet/hash.rs index 4f90ba6cf8..cbf1aa32cf 100644 --- a/net/src/packet/hash.rs +++ b/net/src/packet/hash.rs @@ -110,7 +110,7 @@ mod tests { let start: u64 = 4; let end: u64 = 10; let num_packets: u64 = cfg_select! { - miri => 20, + emulated => 20, _ => 2000, }; let packets = build_test_packets(num_packets.try_into().unwrap()); diff --git a/nix/pkgs/dplane-plugin/default.nix b/nix/pkgs/dplane-plugin/default.nix index b6a68db7b9..f48f184fcb 100644 --- a/nix/pkgs/dplane-plugin/default.nix +++ b/nix/pkgs/dplane-plugin/default.nix @@ -20,15 +20,6 @@ stdenv.mkDerivation (finalAttrs: { version = sources.dplane-plugin.revision; src = sources.dplane-plugin.outPath; - # workaround: src/hh_dp_msg.c reaches into a glibc-internal anonymous - # union name (`.__in6_u.__u6_addr8`) on struct in6_addr. musl exposes - # the POSIX-standard `.s6_addr` member directly without that wrapping - # union, so the access fails to compile. - # remove once fixed upstream in githedgehog/dplane-plugin. - postPatch = '' - sed -i 's/\.__in6_u\.__u6_addr8/.s6_addr/g' src/hh_dp_msg.c - ''; - doCheck = false; doFixup = false; enableParallelBuilding = true; diff --git a/nix/profiles.nix b/nix/profiles.nix index c18158fee4..a168e28054 100644 --- a/nix/profiles.nix +++ b/nix/profiles.nix @@ -2,6 +2,7 @@ # Copyright Open Network Fabric Authors { arch, + host-arch, profile, sanitizers, instrumentation, @@ -21,6 +22,9 @@ let # failing test aborts the runner and leaks state (netns / caps). needs-unwind = for-tests || builtins.elem "loom" cargo-features || builtins.elem "shuttle" cargo-features; + # Test archives are loaded back onto the build host and executed + # there; when the target arch differs, that means qemu-user. + is-emulated-test = for-tests && (arch != host-arch); common.NIX_CFLAGS_COMPILE = [ "-g3" "-gdwarf-5" @@ -38,6 +42,9 @@ let ]; common.RUSTFLAGS = [ "--cfg=tokio_unstable" + # Register `emulated` so `#[cfg_attr(emulated, ...)]` never trips + # `unexpected_cfgs`; only *set* for is-emulated-test and miri. + "--check-cfg=cfg(emulated)" "-Cdebuginfo=full" "-Cdwarf-version=5" "-Csymbol-mangling-version=v0" @@ -52,6 +59,7 @@ let "-Cpanic=abort" ] ) + ++ (if is-emulated-test then [ "--cfg=emulated" ] else [ ]) ++ (map (flag: "-Clink-arg=${flag}") common.NIX_CFLAGS_LINK); optimize-for.debug.NIX_CFLAGS_COMPILE = [ "-fno-inline" diff --git a/npins/sources.json b/npins/sources.json index 7f0a509903..00b9854857 100644 --- a/npins/sources.json +++ b/npins/sources.json @@ -54,9 +54,9 @@ }, "branch": "master", "submodules": false, - "revision": "ef3e718651d59fd4da5787a9c05e06a594c0136c", - "url": "https://github.com/githedgehog/dplane-plugin/archive/ef3e718651d59fd4da5787a9c05e06a594c0136c.tar.gz", - "hash": "sha256-CRsHKk50XnV23uVJxjN9ZtsIFH/BwZYlW27UL4V0D6E=" + "revision": "f3517e68ef53955f107993b45c64222576e2e42d", + "url": "https://github.com/githedgehog/dplane-plugin/archive/f3517e68ef53955f107993b45c64222576e2e42d.tar.gz", + "hash": "sha256-C5Hn5L7TVwmh4q4EGKHs5fxx6DX9lTqSGHfQTz6lVuM=" }, "dplane-rpc": { "type": "Git", diff --git a/routing/Cargo.toml b/routing/Cargo.toml index 3c941805cc..74e4459638 100644 --- a/routing/Cargo.toml +++ b/routing/Cargo.toml @@ -6,6 +6,9 @@ publish.workspace = true version.workspace = true [features] +loom = ["concurrency/loom"] +shuttle = ["concurrency/shuttle"] +shuttle_dfs = ["concurrency/shuttle_dfs", "shuttle"] testing = [] [dependencies] diff --git a/routing/src/atable/resolver.rs b/routing/src/atable/resolver.rs index ec72510ef8..c86f342e99 100644 --- a/routing/src/atable/resolver.rs +++ b/routing/src/atable/resolver.rs @@ -168,8 +168,8 @@ pub mod tests { #[test] #[cfg_attr( - miri, - ignore = "reads /proc/net/arp and queries kernel interfaces, neither available under miri" + emulated, + ignore = "reads /proc/net/arp and queries kernel interfaces, neither available under miri and not reliable under qemu" )] fn test_adjacency_resolver() { let (mut resolver, atabler) = AtResolver::new(true); diff --git a/routing/src/cli/handler.rs b/routing/src/cli/handler.rs index dae3ccdd69..71d5f38403 100644 --- a/routing/src/cli/handler.rs +++ b/routing/src/cli/handler.rs @@ -22,11 +22,11 @@ use crate::routingdb::RoutingDb; use chrono::Local; use cli::cliproto::{CliAction, CliError, CliRequest, CliResponse, RequestArgs, RouteProtocol}; +use concurrency::sync::Arc; use config::{ConfigSummary, GwConfigMeta, ValidatedGwConfig}; use lpm::prefix::{Ipv4Prefix, Ipv6Prefix}; use net::vxlan::Vni; use std::os::unix::net::SocketAddr; -use std::sync::Arc; use common::cliprovider::{CliDataProvider, Heading}; use strum::IntoEnumIterator; diff --git a/routing/src/config/mod.rs b/routing/src/config/mod.rs index 8b65d3b378..9e7b0199d3 100644 --- a/routing/src/config/mod.rs +++ b/routing/src/config/mod.rs @@ -335,7 +335,7 @@ mod tests { Ok(()) } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_initial() { let mut db = create_routing_database(); @@ -343,7 +343,7 @@ mod tests { test_apply_config(&config, &mut db).expect("Should succeed"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_invalid() { let mut db = create_routing_database(); @@ -361,7 +361,7 @@ mod tests { assert!(result.is_err_and(|e| matches!(e, RouterError::InvalidConfig(_)))); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_reapply() { let mut db = create_routing_database(); @@ -371,7 +371,7 @@ mod tests { test_apply_config(&config, &mut db).expect("Should succeed"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_reconfig_vrf_name_and_vni() { let mut db = create_routing_database(); @@ -389,7 +389,7 @@ mod tests { test_apply_config(&config, &mut db).expect("Should succeed"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_reconfig_vnis() { let mut db = create_routing_database(); @@ -407,7 +407,7 @@ mod tests { test_apply_config(&config, &mut db).expect("Should succeed"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_swap_vrf_vnis() { let mut db = create_routing_database(); @@ -430,7 +430,7 @@ mod tests { } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_config_change_interface() { let mut db = create_routing_database(); diff --git a/routing/src/fib/test.rs b/routing/src/fib/test.rs index 8e47598b62..74a39c4414 100644 --- a/routing/src/fib/test.rs +++ b/routing/src/fib/test.rs @@ -24,15 +24,13 @@ mod tests { use lpm::prefix::{IpAddr, Prefix}; + use concurrency::sync::Arc; + use concurrency::sync::atomic::AtomicU16; + use concurrency::thread::Builder; use rand::RngExt; use rand::rngs::ThreadRng; use std::str::FromStr; - use std::sync::Arc; - use std::sync::atomic::AtomicBool; - use std::sync::atomic::AtomicU16; - use std::thread; - use std::thread::Builder; - use std::time::{Duration, Instant}; + use std::time::Instant; use std::{collections::HashMap, collections::HashSet, sync::atomic::Ordering}; use crate::fib::fibgroupstore::tests::build_fib_entry_egress; @@ -113,7 +111,7 @@ mod tests { #[test] fn test_concurrency_fib() { const NUM_PACKETS: u64 = cfg_select! { - miri => 50, + emulated => 50, _ => 100_000, }; const NUM_WORKERS: u16 = 4; @@ -254,7 +252,7 @@ mod tests { // number of threads looking up fibtable const NUM_WORKERS: u16 = 6; const NUM_PACKETS: u64 = cfg_select! { - miri => 30, + emulated => 30, _ => 100_000, }; const TENTH: u64 = NUM_PACKETS / 10; @@ -281,7 +279,7 @@ mod tests { let handle = Builder::new() .name(format!("WORKER-{n}")) .spawn(move || { - #[cfg(not(miri))] + #[cfg(not(emulated))] println!("Worker-{n} started"); let mut rng = rand::rng(); let mut packet = test_packet(); @@ -297,7 +295,7 @@ mod tests { if hit == prefix { prefix_hits += 1; if prefix_hits.is_multiple_of(TENTH) { - #[cfg(not(miri))] + #[cfg(not(emulated))] println!( "Worker {n} is {} % done", prefix_hits * 100 / NUM_PACKETS @@ -317,7 +315,7 @@ mod tests { nofibs += 1; } } - #[cfg(not(miri))] + #[cfg(not(emulated))] { println!("=== Worker {n} finished ===="); println!("Stats:"); @@ -379,7 +377,7 @@ mod tests { // stop when all workers are done if done.load(Ordering::Relaxed) == NUM_WORKERS { - #[cfg(not(miri))] + #[cfg(not(emulated))] println!("All workers finished!"); if let Some(fib) = fibw.take() { // fib is destroyed here @@ -394,13 +392,102 @@ mod tests { } let duration = start.elapsed(); - #[cfg(not(miri))] + #[cfg(not(emulated))] println!("Test duration: {duration:?}"); } + // Tests fib reader utilities returning guards #[test] + fn test_fib_guards() { + // create fib + let (mut fibw, fibr) = FibWriter::new(FibKey::Id(0)); + + // add a route + let prefix = Prefix::from("192.168.1.0/24"); + let nhkey = NhopKey::with_address(&IpAddr::from_str("7.0.0.1").unwrap()); + let e1 = build_fib_entry_egress(1, "10.0.1.1", "eth1"); + let fibgroup1 = build_fibgroup(std::slice::from_ref(&e1)); + fibw.register_fibgroup(&nhkey, &fibgroup1, false); + fibw.add_fibroute(prefix, vec![nhkey.clone()], false); + fibw.publish(); + + // use the fib: do lpm for some destination and get a route + let destination = IpAddr::from_str("192.168.1.1").expect("Bad dst ip address"); + let route = &*fibr.lpm_route(destination).unwrap(); + assert!(route.has_entries()); + assert_eq!(route.len(), fibgroup1.len()); + assert!(route.iter().any(|g| g == &fibgroup1)); + + // use again the fib with a packet + let packet = test_packet(); + let (matched, entry1) = fibr.lpm_entry_prefix(&packet).unwrap(); + assert_eq!(matched, prefix); + assert!(fibgroup1.entries().contains(&*entry1)); + + // attempt to modify the route by modifying the fibgroup + let e2 = build_fib_entry_egress(2, "10.0.2.1", "eth2"); + let fibgroup2 = build_fibgroup(std::slice::from_ref(&e2)); + fibw.register_fibgroup(&nhkey, &fibgroup2, false); + fibw.add_fibroute(prefix, vec![nhkey.clone()], false); + fibw.publish(); + + // a second query to the fib yields the updated value. This is counter-intuitive but correct, + // because of the number of readers we have and the fact that the original fib guard was forgotten. + let (_, entry2) = fibr.lpm_entry_prefix(&packet).unwrap(); + assert_eq!(&*entry2, &e2); + + // ... but the guard to the entry wasn't. + assert_eq!(&*entry1, &e1); + + // Additional queries while holding the guards would cause the writer to block. + // We can't test this here since there's a single thread and it would block forever. + } +} + +// Loom is excluded: the left_right epoch state space is too large for +// exhaustive search to terminate. Shuttle's PortfolioRunner is the right +// tool at this dimensionality. The heavy fuzz tests in `mod tests` above +// stay on the std backend -- they're calibrated for TSAN, not for the +// per-iteration cost of model-checking. +#[cfg(not(feature = "loom"))] +mod concurrency_tests { + use crate::fib::fibtable::FibTableWriter; + use crate::fib::fibtype::FibKey; + + use concurrency::sync::Arc; + use concurrency::sync::atomic::{AtomicBool, Ordering}; + use concurrency::thread; + + use net::buffer::TestBuffer; + use net::ip::NextHeader; + use net::packet::Packet; + use net::packet::test_utils::build_test_ipv4_packet_with_transport; + + use lpm::prefix::IpAddr; + + use std::str::FromStr; + use std::time::Duration; + + fn test_packet() -> Packet { + let mut packet = build_test_ipv4_packet_with_transport(64, Some(NextHeader::UDP)).unwrap(); + let destination = IpAddr::from_str("192.168.1.1").expect("Bad dst ip address"); + packet.set_ip_destination(destination).unwrap(); + packet + } + + #[concurrency::test] fn test_fib_removals() { - // create fibtable (empty, without any fib) + const MAX_ITERATIONS: usize = cfg_select! { + any(feature = "loom", feature = "shuttle") => 5, + emulated => 50, + _ => 1000, + }; + // Cap the reader's polling so shuttle/loom don't hit `exceeded max_steps`. + const READER_BUDGET: usize = cfg_select! { + any(feature = "loom", feature = "shuttle") => MAX_ITERATIONS * 4, + _ => usize::MAX, + }; + let (mut fibtw, fibtr) = FibTableWriter::new(); let fibtrfactory = fibtr.factory(); let vrfid = 1; @@ -411,6 +498,7 @@ mod tests { let handle = thread::spawn(move || { let fibtr = fibtrfactory.handle(); let mut enters = 0; + let mut budget = READER_BUDGET; let packet = test_packet(); loop { if let Ok(fib) = fibtr.get_fib_reader(FibKey::Id(vrfid)) { @@ -419,7 +507,8 @@ mod tests { enters += 1; } } - if thread_stop.load(Ordering::Relaxed) { + budget = budget.saturating_sub(1); + if budget == 0 || thread_stop.load(Ordering::Relaxed) { println!("entered: {enters} times"); break; } @@ -428,11 +517,10 @@ mod tests { let mut iterations = 0; loop { - const MAX_ITERATIONS: usize = cfg_select! { - miri => 50, - _ => 1000, - }; let fibw = fibtw.add_fib(vrfid, None); + // No real elapsed time under loom/shuttle (sleep is a yield); on + // the std backend the 5 ms wait gives the reader thread room to + // run between writer ops. thread::sleep(Duration::from_millis(5)); fibtw.del_fib(vrfid, None); fibw.destroy(); @@ -446,18 +534,20 @@ mod tests { handle.join().unwrap(); } - // Minimal reproducer for the TSAN race reported on `test_concurrency_fibtable`. + // Minimal reproducer for the TSAN race reported on + // `test_concurrency_fibtable`. // // Uses `left_right` directly with a trivial struct that has no interior // allocations and no relationship to `Fib`, `FibTable`, or the // `ReadHandleCache` (no thread-local storage). If this test races under - // ThreadSanitizer, the bug lives in `left_right::WriteHandle::drop` (in - // `take_inner`: NULL-swap followed by `wait()` on stale `last_epochs`), + // ThreadSanitizer the bug lives in `left_right::WriteHandle::drop` + // (`take_inner`: NULL-swap followed by `wait()` on stale `last_epochs`), // *not* in our code. - #[test] + #[concurrency::test] fn test_leftright_destroy_race_simple() { + use concurrency::sync::RwLock; + use concurrency::thread::Builder; use left_right::{Absorb, ReadHandle, ReadHandleFactory}; - use std::sync::RwLock; #[derive(Default)] struct Tiny { @@ -479,11 +569,21 @@ mod tests { } } - const NUM_WORKERS: u16 = 6; + const NUM_WORKERS: u16 = cfg_select! { + any(feature = "loom", feature = "shuttle") => 2, + _ => 6, + }; const ITERATIONS: usize = cfg_select! { - miri => 50, + any(feature = "loom", feature = "shuttle") => 2, + emulated => 50, _ => 5_000, }; + // Bound the workers' polling so shuttle / loom don't hit + // `exceeded max_steps` on the spin loop. + const WORKER_BUDGET: usize = cfg_select! { + any(feature = "loom", feature = "shuttle") => ITERATIONS * 4, + _ => usize::MAX, + }; // Shared, lock-protected factory (or None) that writer populates with a new factory // anytime a new write handle is created and which workers use to get fresh handles. @@ -500,9 +600,10 @@ mod tests { .spawn(move || { let mut enters = 0u64; let mut misses = 0u64; + let mut budget = WORKER_BUDGET; loop { let rh: Option> = - slot.read().unwrap().as_ref().map(ReadHandleFactory::handle); + slot.read().as_ref().map(ReadHandleFactory::handle); if let Some(rh) = rh { match rh.enter() { Some(g) => { @@ -517,7 +618,8 @@ mod tests { None => misses += 1, } } - if stop.load(Ordering::Relaxed) { + budget = budget.saturating_sub(1); + if budget == 0 || stop.load(Ordering::Relaxed) { break; } } @@ -531,7 +633,7 @@ mod tests { let (mut w, r) = left_right::new::(); // Publish the factory so workers can get handles - *factory.write().unwrap() = Some(r.factory()); + *factory.write() = Some(r.factory()); drop(r); // Let workers race with the upcoming drop. `yield_now` is enough; @@ -545,58 +647,11 @@ mod tests { // Remove the factory so that no further handles can be created. // Workers already holding a read handle should get a None when // attempting to `enter()`. - *factory.write().unwrap() = None; + *factory.write() = None; } stop.store(true, Ordering::Relaxed); for h in handles { h.join().unwrap(); } } - - // Tests fib reader utilities returning guards - #[test] - fn test_fib_guards() { - // create fib - let (mut fibw, fibr) = FibWriter::new(FibKey::Id(0)); - - // add a route - let prefix = Prefix::from("192.168.1.0/24"); - let nhkey = NhopKey::with_address(&IpAddr::from_str("7.0.0.1").unwrap()); - let e1 = build_fib_entry_egress(1, "10.0.1.1", "eth1"); - let fibgroup1 = build_fibgroup(std::slice::from_ref(&e1)); - fibw.register_fibgroup(&nhkey, &fibgroup1, false); - fibw.add_fibroute(prefix, vec![nhkey.clone()], false); - fibw.publish(); - - // use the fib: do lpm for some destination and get a route - let destination = IpAddr::from_str("192.168.1.1").expect("Bad dst ip address"); - let route = &*fibr.lpm_route(destination).unwrap(); - assert!(route.has_entries()); - assert_eq!(route.len(), fibgroup1.len()); - assert!(route.iter().any(|g| g == &fibgroup1)); - - // use again the fib with a packet - let packet = test_packet(); - let (matched, entry1) = fibr.lpm_entry_prefix(&packet).unwrap(); - assert_eq!(matched, prefix); - assert!(fibgroup1.entries().contains(&*entry1)); - - // attempt to modify the route by modifying the fibgroup - let e2 = build_fib_entry_egress(2, "10.0.2.1", "eth2"); - let fibgroup2 = build_fibgroup(std::slice::from_ref(&e2)); - fibw.register_fibgroup(&nhkey, &fibgroup2, false); - fibw.add_fibroute(prefix, vec![nhkey.clone()], false); - fibw.publish(); - - // a second query to the fib yields the updated value. This is counter-intuitive but correct, - // because of the number of readers we have and the fact that the original fib guard was forgotten. - let (_, entry2) = fibr.lpm_entry_prefix(&packet).unwrap(); - assert_eq!(&*entry2, &e2); - - // ... but the guard to the entry wasn't. - assert_eq!(&*entry1, &e1); - - // Additional queries while holding the guards would cause the writer to block. - // We can't test this here since there's a single thread and it would block forever. - } } diff --git a/routing/src/frr/test.rs b/routing/src/frr/test.rs index 78973fb9b1..d83322e6fb 100644 --- a/routing/src/frr/test.rs +++ b/routing/src/frr/test.rs @@ -130,11 +130,11 @@ pub mod tests { use tokio::sync::RwLock; use tracing_test::traced_test; - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[tokio::test] #[cfg_attr( - miri, - ignore = "binds Unix domain sockets at /tmp/*.sock for the fake FRR agent" + emulated, + ignore = "binds Unix domain sockets; miri has no UDS, qemu-user has flaky epoll readiness" )] async fn test_fake_frr_agent() { let dp_status: Arc> = Arc::new(RwLock::new(DataplaneStatus::new())); diff --git a/routing/src/rib/nexthop.rs b/routing/src/rib/nexthop.rs index 47ff76ff8c..53dddaeafb 100644 --- a/routing/src/rib/nexthop.rs +++ b/routing/src/rib/nexthop.rs @@ -419,7 +419,7 @@ mod tests { use std::rc::Rc; use tracing_test::traced_test; - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] /// Tests the correct behavior of the next-hop store fn test_nhop_store_minimal() { @@ -445,7 +445,7 @@ mod tests { store.dump(); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] /// Tests the correct deletion of next-hops fn test_nhop_reuse_and_deletion() { @@ -482,7 +482,7 @@ mod tests { assert_eq!(store.len(), 0); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] /// Tests the correctness of next-hops reference counts fn test_nhop_ref_counts() { @@ -533,7 +533,7 @@ mod tests { store.dump(); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_nhop_store_shared_resolvers() { let mut store = NhopStore::new(); @@ -576,7 +576,7 @@ mod tests { assert!(store.is_empty()); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] /// Tests flushing of next-hop resolution data fn test_nhop_store_flush_resolvers() { @@ -701,7 +701,7 @@ mod tests { store } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_nhop_store_consistency() { /* create store */ @@ -811,7 +811,7 @@ mod tests { println!("{:#?}", &res); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_loop_prevention() { let mut store = NhopStore::new(); diff --git a/routing/src/rib/vrftable.rs b/routing/src/rib/vrftable.rs index a2749909d6..4b80d68fcd 100644 --- a/routing/src/rib/vrftable.rs +++ b/routing/src/rib/vrftable.rs @@ -439,7 +439,7 @@ mod tests { vni.try_into().expect("Bad vni") } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn vrf_table_basic() { /* create fib table */ @@ -628,7 +628,7 @@ mod tests { println!("{vrftable}"); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn vrf_table_vnis() { debug!("━━━━Test: Create vrf table"); @@ -686,7 +686,7 @@ mod tests { } } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn vrf_table_deletions() { debug!("━━━━Test: Create testing interface table"); @@ -919,7 +919,7 @@ mod tests { test_vrf_fibgroup(build_test_vrf()); } - #[traced_test] + #[cfg_attr(not(emulated), traced_test)] #[test] fn test_vrf_fibgroup_2() { test_vrf_fibgroup(build_test_vrf_nhops_partially_resolved()); diff --git a/routing/src/router/ctl.rs b/routing/src/router/ctl.rs index 4fb6fd2f10..3fda28e73e 100644 --- a/routing/src/router/ctl.rs +++ b/routing/src/router/ctl.rs @@ -3,9 +3,9 @@ //! Control channel for the router +use concurrency::sync::Arc; use config::{GwConfigMeta, ValidatedGwConfig}; use mio::Interest; -use std::sync::Arc; use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::error::TryRecvError; use tokio::sync::oneshot; diff --git a/routing/src/router/rio.rs b/routing/src/router/rio.rs index 9bde261120..6916adb8f1 100644 --- a/routing/src/router/rio.rs +++ b/routing/src/router/rio.rs @@ -26,12 +26,12 @@ use dplane_rpc::socks::RpcCachedSock; use mio::unix::SourceFd; use mio::{Events, Interest, Poll, Token}; +use concurrency::sync::Arc; use nix::sys::socket::{getsockopt, setsockopt, sockopt::SndBuf}; use std::fs; use std::os::fd::AsRawFd; use std::os::unix::fs::PermissionsExt; use std::os::unix::net::UnixDatagram; -use std::sync::Arc; use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; use tokio::sync::mpsc::{Receiver, Sender, channel}; @@ -483,7 +483,7 @@ mod tests { use std::time::Duration; #[test] - #[cfg_attr(miri, ignore = "binds Unix domain sockets at /tmp/hh_*.sock")] + #[cfg_attr(emulated, ignore = "binds Unix domain sockets at /tmp/hh_*.sock")] fn test_rio_ctl() { let cpi_bind_addr = "/tmp/hh_dataplane.sock".to_string(); let cli_bind_addr = "/tmp/hh_cli.sock".to_string(); @@ -513,7 +513,7 @@ mod tests { assert_eq!(cpi.finish(), Ok(())); } #[test] - #[cfg_attr(miri, ignore = "exercises Unix domain socket bind paths")] + #[cfg_attr(emulated, ignore = "exercises Unix domain socket bind paths")] fn test_rio_bad_path() { /* Build rio configuration with bad path for unix sock */ let conf = RioConf { diff --git a/scripts/test-runner.sh b/scripts/test-runner.sh new file mode 100755 index 0000000000..9c0ccc4fd1 --- /dev/null +++ b/scripts/test-runner.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# SPDX-License-Identifier: Apache-2.0 +# Copyright Open Network Fabric Authors + +set -euo pipefail +set -o allexport + +if [ -n "${MIRI_SYSROOT:-}" ]; then + declare miri_wrapper + miri_wrapper="$(dirname "${CARGO:-cargo}")/.cargo-miri-wrapped" + declare -r miri_wrapper + if [ -x "${miri_wrapper}" ]; then + exec "${miri_wrapper}" runner "${@}" + else + echo "test-runner.sh: MIRI_SYSROOT is set but ${miri_wrapper} is not an executable file" >&2 + exit 1 + fi +fi + +declare host_machine +host_machine="$(uname --machine)" +declare -r host_machine + +declare host_kernel_name +host_kernel_name="$(uname --kernel-name)" +host_kernel_name="${host_kernel_name,,}" # convert to lower case +declare -r host_kernel_name + +declare -r target_machine="${1}" +shift + +if [ "${host_machine}" = "${target_machine}" ] && [ "${host_kernel_name}" = "linux" ]; then + exec "${@}" +else + exec "qemu-${target_machine}" "${@}" +fi diff --git a/sysfs/Cargo.toml b/sysfs/Cargo.toml index 4c73687c4d..6ce8d448b0 100644 --- a/sysfs/Cargo.toml +++ b/sysfs/Cargo.toml @@ -7,6 +7,7 @@ version.workspace = true [dependencies] # internal +concurrency = { workspace = true } # external nix = { workspace = true, features = ["mount", "fs"] } diff --git a/sysfs/src/lib.rs b/sysfs/src/lib.rs index 9b80e2482d..25497fdadf 100644 --- a/sysfs/src/lib.rs +++ b/sysfs/src/lib.rs @@ -8,9 +8,9 @@ //! //! [sysfs]: https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt +use concurrency::sync::LazyLock; use std::os::fd::AsFd; use std::path::{Path, PathBuf}; -use std::sync::LazyLock; use tracing::{error, info}; diff --git a/tracectl/Cargo.toml b/tracectl/Cargo.toml index 9a1fac2fac..909d31d68a 100644 --- a/tracectl/Cargo.toml +++ b/tracectl/Cargo.toml @@ -8,6 +8,7 @@ version.workspace = true [dependencies] color-eyre = { workspace = true , features = [ "capture-spantrace", "color-spantrace", "tracing-error", "track-caller" ] } common = { workspace = true } +concurrency = { workspace = true } linkme = { workspace = true } ordermap = { workspace = true, features = ["std"] } thiserror = { workspace = true } diff --git a/tracectl/src/control.rs b/tracectl/src/control.rs index 08aa6f71e5..a78c5d22f3 100644 --- a/tracectl/src/control.rs +++ b/tracectl/src/control.rs @@ -3,10 +3,10 @@ //! Tracing runtime control. +use concurrency::sync::{Arc, LazyLock, Mutex, MutexGuard}; use ordermap::OrderMap; +use std::collections::HashSet; use std::str::FromStr; -use std::sync::{Arc, LazyLock, Mutex}; -use std::{collections::HashSet, sync::MutexGuard}; use thiserror::Error; #[allow(unused)] use tracing::{debug, error, info, warn}; @@ -312,11 +312,9 @@ impl TracingControl { reload_filter: Arc::new(reload_filter), } } - /// This method should remain private and never be used other than from methods of `TracingControl` - fn lock(&self) -> Result, TraceCtlError> { - self.db - .lock() - .map_err(|e| TraceCtlError::LockFailure(e.to_string())) + /// This method should remain private and never be used other than from methods of `TracingControl`. + fn lock(&self) -> MutexGuard<'_, TargetCfgDb> { + self.db.lock() } fn reload(&self, filter: EnvFilter) -> Result<(), TraceCtlError> { self.reload_filter @@ -332,7 +330,7 @@ impl TracingControl { tags: &'static [&'static str], custom: bool, ) { - let mut db = self.lock().unwrap(); + let mut db = self.lock(); db.register(target, name, level, tags, custom); self.reload(db.env_filter()).unwrap(); } @@ -352,7 +350,7 @@ impl TracingControl { let _ = get_trace_ctl(); } fn set_tag_level(&self, tag: &str, level: LevelFilter) -> Result<(), TraceCtlError> { - let mut db = self.lock()?; + let mut db = self.lock(); let changed = db.set_tag_level(tag, level)?; if changed > 0 { self.reload(db.env_filter())?; @@ -361,7 +359,7 @@ impl TracingControl { Ok(()) } pub fn set_default_level(&self, level: LevelFilter) -> Result<(), TraceCtlError> { - let mut db = self.lock()?; + let mut db = self.lock(); if db.default != level { info!("Changing default log-level from {} to {level}", db.default); db.default = level; @@ -370,7 +368,7 @@ impl TracingControl { Ok(()) } pub fn get_default_level(&self) -> Result { - let db = self.lock()?; + let db = self.lock(); Ok(db.default) } @@ -409,27 +407,27 @@ impl TracingControl { #[cfg(test)] #[allow(clippy::missing_panics_doc)] pub fn get_tags(&self) -> impl Iterator { - self.db.lock().unwrap().tags.clone().into_values() + self.db.lock().tags.clone().into_values() } #[cfg(test)] #[allow(clippy::missing_panics_doc)] #[must_use] pub fn get_tag(&self, tag: &str) -> Option { - self.db.lock().unwrap().tags.get(tag).cloned() + self.db.lock().tags.get(tag).cloned() } #[cfg(test)] #[allow(clippy::missing_panics_doc)] #[must_use] pub fn get_target(&self, target: &str) -> Option { - self.db.lock().unwrap().targets.get(target).cloned() + self.db.lock().targets.get(target).cloned() } #[cfg(test)] #[allow(clippy::missing_panics_doc)] pub fn get_targets_by_tag(&self, tag: &str) -> impl Iterator { - let db = self.db.lock().unwrap(); + let db = self.db.lock(); db.tag_targets(tag) .map(|x| (*x).clone()) .collect::>() @@ -437,7 +435,7 @@ impl TracingControl { } pub fn as_config_string(&self) -> Result { - Ok(self.lock()?.as_config_string()) + Ok(self.lock().as_config_string()) } fn reconfigure_internal<'a>( @@ -446,7 +444,7 @@ impl TracingControl { tag_config: impl Iterator, resolver: &dyn Resolver, ) -> Result<(), TraceCtlError> { - let mut db = self.lock()?; + let mut db = self.lock(); let changed = db.reconfigure(default, tag_config, resolver); if changed > 0 { self.reload_filter @@ -466,13 +464,13 @@ impl TracingControl { self.reconfigure_internal(default, tag_config, &ResolveByTagSize) } pub fn check_tags(&self, tags: &[&str]) -> Result<(), TraceCtlError> { - self.lock()?.check_tags(tags) + self.lock().check_tags(tags) } pub fn as_string(&self) -> Result { - Ok(self.lock()?.to_string()) + Ok(self.lock().to_string()) } pub fn as_string_by_tag(&self) -> Result { - let db = self.lock()?; + let db = self.lock(); Ok(TargetCfgDbByTag(&db).to_string()) } } @@ -521,7 +519,7 @@ mod tests { "The current default loglevel is {}", tctl.get_default_level().unwrap() ); - println!("{:#?}", tctl.db.lock().unwrap()); + println!("{:#?}", tctl.db.lock()); } #[test] @@ -586,12 +584,12 @@ mod tests { // check target presence in database: all should be there even if defined later let tctl = get_trace_ctl(); - assert!(tctl.db.lock().unwrap().targets.contains_key(module_path!())); - assert!(tctl.db.lock().unwrap().targets.contains_key("target-1")); - assert!(tctl.db.lock().unwrap().targets.contains_key("target-2")); - assert!(tctl.db.lock().unwrap().targets.contains_key("target-3")); - assert!(tctl.db.lock().unwrap().targets.contains_key("target-4")); - assert!(tctl.db.lock().unwrap().targets.contains_key("func1")); + assert!(tctl.db.lock().targets.contains_key(module_path!())); + assert!(tctl.db.lock().targets.contains_key("target-1")); + assert!(tctl.db.lock().targets.contains_key("target-2")); + assert!(tctl.db.lock().targets.contains_key("target-3")); + assert!(tctl.db.lock().targets.contains_key("target-4")); + assert!(tctl.db.lock().targets.contains_key("func1")); // this is declared after the checks custom_target!("target-4", LevelFilter::OFF, &["target-4"]); @@ -626,7 +624,7 @@ mod tests { custom_target!(TARGET, LevelFilter::TRACE, &[]); let tctl = get_trace_ctl(); - assert!(tctl.db.lock().unwrap().targets.contains_key(TARGET)); + assert!(tctl.db.lock().targets.contains_key(TARGET)); let target = tctl.get_target(TARGET).expect("Should be found"); assert_eq!(target.level, LevelFilter::TRACE);