From c2ec4acc7a6632473044c923394ab35400bd9ee2 Mon Sep 17 00:00:00 2001 From: Flakebi Date: Tue, 24 Mar 2026 10:58:31 +0100 Subject: [PATCH 1/2] Document convergent limitation on intrinsics --- .../src/amdgpu/intrinsic_is_convergent.md | 6 ++ crates/core_arch/src/amdgpu/mod.rs | 72 +++++++++++++++++++ crates/core_arch/src/nvptx/mod.rs | 2 + 3 files changed, 80 insertions(+) create mode 100644 crates/core_arch/src/amdgpu/intrinsic_is_convergent.md diff --git a/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md b/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md new file mode 100644 index 0000000000..1bc8899d33 --- /dev/null +++ b/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md @@ -0,0 +1,6 @@ +This intrinsic does not behave like a normal function call; it is a "[convergent]" operation and as such has non-standard control-flow effects which need special treatment by the language. +Rust currently does not properly support convergent operations. +This operation is hence provided on a best-effort basis. +Using it may result in incorrect code under some circumstances. + +[convergent]: https://llvm.org/docs/ConvergentOperations.html diff --git a/crates/core_arch/src/amdgpu/mod.rs b/crates/core_arch/src/amdgpu/mod.rs index 40274e4d79..374f582696 100644 --- a/crates/core_arch/src/amdgpu/mod.rs +++ b/crates/core_arch/src/amdgpu/mod.rs @@ -244,6 +244,8 @@ pub fn wavefrontsize() -> u32 { /// Synchronize all wavefronts in a workgroup. /// /// Each wavefronts in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn s_barrier() { @@ -253,6 +255,8 @@ pub fn s_barrier() { /// Signal a specific barrier type. /// /// Only for non-named barriers. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn s_barrier_signal() { @@ -265,6 +269,8 @@ pub unsafe fn s_barrier_signal() { /// Provides access to the s_barrier_signal_first instruction; /// additionally ensures that the result value is valid even when /// the intrinsic is used from a wavefront that is not running in a workgroup. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn s_barrier_signal_isfirst() -> bool { @@ -274,6 +280,8 @@ pub unsafe fn s_barrier_signal_isfirst() -> bool { /// Wait for a specific barrier type. /// /// Only for non-named barriers. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn s_barrier_wait() { @@ -283,6 +291,8 @@ pub unsafe fn s_barrier_wait() { /// Get the state of a specific barrier type. /// /// The `barrier_type` argument must be uniform, otherwise behavior is undefined. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn s_get_barrier_state() -> u32 { @@ -292,6 +302,8 @@ pub unsafe fn s_get_barrier_state() -> u32 { /// A barrier for only the threads within the current wavefront. /// /// Does not result in an instruction but restricts the compiler. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_barrier() { @@ -315,6 +327,8 @@ pub fn wave_barrier() { /// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`. /// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`. /// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn sched_barrier() { @@ -345,6 +359,8 @@ pub unsafe fn sched_barrier() { /// // 5 MFMA /// sched_group_barrier::<8, 5, 0>() /// ``` +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn sched_group_barrier() { @@ -366,6 +382,8 @@ pub fn s_sleep() { /// Stop execution of the kernel. /// /// This usually signals an error state. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn s_sethalt() -> ! { @@ -407,6 +425,8 @@ pub fn mbcnt_hi(value: u32, init: u32) -> u32 { /// Returns a bitfield (`u32` or `u64`) containing the result of its i1 argument /// in all active lanes, and zero in all inactive lanes. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn ballot(b: bool) -> u64 { @@ -419,6 +439,8 @@ pub fn ballot(b: bool) -> u64 { /// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`. /// This means `inverse_ballot(ballot(b)) == b`. /// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn inverse_ballot(value: u64) -> bool { @@ -433,6 +455,8 @@ pub fn inverse_ballot(value: u64) -> bool { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_umin(value: u32) -> u32 { @@ -447,6 +471,8 @@ pub fn wave_reduce_umin(value: u32) -> u32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_min(value: i32) -> i32 { @@ -462,6 +488,8 @@ pub fn wave_reduce_min(value: i32) -> i32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_umax(value: u32) -> u32 { @@ -476,6 +504,8 @@ pub fn wave_reduce_umax(value: u32) -> u32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_max(value: i32) -> i32 { @@ -491,6 +521,8 @@ pub fn wave_reduce_max(value: i32) -> i32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_add(value: u32) -> u32 { @@ -506,6 +538,8 @@ pub fn wave_reduce_add(value: u32) -> u32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_and(value: u32) -> u32 { @@ -520,6 +554,8 @@ pub fn wave_reduce_and(value: u32) -> u32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_or(value: u32) -> u32 { @@ -534,6 +570,8 @@ pub fn wave_reduce_or(value: u32) -> u32 { /// - 2: DPP /// /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn wave_reduce_xor(value: u32) -> u32 { @@ -544,12 +582,16 @@ pub fn wave_reduce_xor(value: u32) -> u32 { // The following intrinsics can have multiple sizes /// Get `value` from the first active lane in the wavefront. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn readfirstlane_u32(value: u32) -> u32 { llvm_readfirstlane_u32(value) } /// Get `value` from the first active lane in the wavefront. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn readfirstlane_u64(value: u64) -> u64 { @@ -559,6 +601,8 @@ pub fn readfirstlane_u64(value: u64) -> u64 { /// /// The lane argument must be uniform across the currently active threads /// of the current wavefront. Otherwise, the result is undefined. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 { @@ -568,6 +612,8 @@ pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 { /// /// The lane argument must be uniform across the currently active threads /// of the current wavefront. Otherwise, the result is undefined. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 { @@ -582,6 +628,8 @@ pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 { /// /// `value` is the value returned by `lane`. /// `default` is the value returned by all lanes other than `lane`. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 { @@ -596,6 +644,8 @@ pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 { /// /// `value` is the value returned by `lane`. /// `default` is the value returned by all lanes other than `lane`. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 { @@ -605,6 +655,8 @@ pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 { /// Stop execution of the wavefront. /// /// This usually signals the end of a successful execution. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub fn endpgm() -> ! { @@ -621,6 +673,8 @@ pub fn endpgm() -> ! { /// v_mov_b32 /// v_mov_b32 /// ``` +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn update_dpp< @@ -651,6 +705,8 @@ pub fn s_memrealtime() -> u64 { /// /// Reading from inactive lanes returns `0`. /// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 { @@ -661,6 +717,8 @@ pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 { /// Returns the `value` given to `ds_permute` by lane `lane`. /// /// Reading from inactive lanes returns `0`. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 { @@ -680,6 +738,8 @@ pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 { /// /// The third and fourth inputs must be uniform across the current wavefront. /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlane16_u32( @@ -696,6 +756,8 @@ pub unsafe fn permlane16_u32( /// /// The third and fourth inputs must be uniform across the current wavefront. /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlanex16_u32( @@ -718,6 +780,8 @@ pub fn s_get_waveid_in_workgroup() -> u32 { /// Swap `value` between upper and lower 32 lanes in a wavefront. /// /// Does nothing for wave32. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlane64_u32(value: u32) -> u32 { @@ -728,6 +792,8 @@ pub unsafe fn permlane64_u32(value: u32) -> u32 { /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand. /// /// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlane16_var( @@ -742,6 +808,8 @@ pub unsafe fn permlane16_var( /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand. /// /// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlanex16_var( @@ -766,6 +834,8 @@ pub fn wave_id() -> u32 { /// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes). /// Returns a pair for the swapped registers. /// The first element of the return corresponds to the swapped element of the first argument. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlane16_swap( @@ -782,6 +852,8 @@ pub unsafe fn permlane16_swap( /// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes). /// Returns a pair for the swapped registers. /// The first element of the return corresponds to the swapped element of the first argument. +/// +#[doc = include_str!("intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_amdgpu", issue = "149988")] pub unsafe fn permlane32_swap( diff --git a/crates/core_arch/src/nvptx/mod.rs b/crates/core_arch/src/nvptx/mod.rs index 5471ef8198..b63a5d01a7 100644 --- a/crates/core_arch/src/nvptx/mod.rs +++ b/crates/core_arch/src/nvptx/mod.rs @@ -49,6 +49,8 @@ unsafe extern "C" { } /// Synchronizes all threads in the block. +/// +#[doc = include_str!("../amdgpu/intrinsic_is_convergent.md")] #[inline] #[unstable(feature = "stdarch_nvptx", issue = "111199")] pub unsafe fn _syncthreads() -> () { From 4ba4f077b8f306b21aa10d78fc0f1359d745bc6f Mon Sep 17 00:00:00 2001 From: The rustc-josh-sync Cronjob Bot Date: Mon, 4 May 2026 05:12:53 +0000 Subject: [PATCH 2/2] Prepare for merging from rust-lang/rust This updates the rust-version file to 045b17737dab5fcc28e4cbee0cfe2ce4ed363b32. --- rust-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-version b/rust-version index e9fc6c4cd0..59e9e5a0e6 100644 --- a/rust-version +++ b/rust-version @@ -1 +1 @@ -e22c616e4e87914135c1db261a03e0437255335e +045b17737dab5fcc28e4cbee0cfe2ce4ed363b32