From c2ec4acc7a6632473044c923394ab35400bd9ee2 Mon Sep 17 00:00:00 2001
From: Flakebi <flakebi@t-online.de>
Date: Tue, 24 Mar 2026 10:58:31 +0100
Subject: [PATCH 1/2] Document convergent limitation on intrinsics

---
 .../src/amdgpu/intrinsic_is_convergent.md     |  6 ++
 crates/core_arch/src/amdgpu/mod.rs            | 72 +++++++++++++++++++
 crates/core_arch/src/nvptx/mod.rs             |  2 +
 3 files changed, 80 insertions(+)
 create mode 100644 crates/core_arch/src/amdgpu/intrinsic_is_convergent.md
diff --git a/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md b/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md
new file mode 100644
index 0000000000..1bc8899d33
--- /dev/null
+++ b/crates/core_arch/src/amdgpu/intrinsic_is_convergent.md
@@ -0,0 +1,6 @@
+This intrinsic does not behave like a normal function call; it is a "[convergent]" operation and as such has non-standard control-flow effects which need special treatment by the language.
+Rust currently does not properly support convergent operations.
+This intrinsic is therefore provided on a best-effort basis only.
+Under some circumstances, the compiler may generate incorrect code when it is used.
+
+[convergent]: https://llvm.org/docs/ConvergentOperations.html
diff --git a/crates/core_arch/src/amdgpu/mod.rs b/crates/core_arch/src/amdgpu/mod.rs
index 40274e4d79..374f582696 100644
--- a/crates/core_arch/src/amdgpu/mod.rs
+++ b/crates/core_arch/src/amdgpu/mod.rs
@@ -244,6 +244,8 @@ pub fn wavefrontsize() -> u32 {
 /// Synchronize all wavefronts in a workgroup.
 ///
 /// Each wavefronts in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn s_barrier() {
@@ -253,6 +255,8 @@ pub fn s_barrier() {
 /// Signal a specific barrier type.
 ///
 /// Only for non-named barriers.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn s_barrier_signal<const BARRIER_TYPE: i32>() {
@@ -265,6 +269,8 @@ pub unsafe fn s_barrier_signal<const BARRIER_TYPE: i32>() {
 /// Provides access to the s_barrier_signal_first instruction;
 /// additionally ensures that the result value is valid even when
 /// the intrinsic is used from a wavefront that is not running in a workgroup.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn s_barrier_signal_isfirst<const BARRIER_TYPE: i32>() -> bool {
@@ -274,6 +280,8 @@ pub unsafe fn s_barrier_signal_isfirst<const BARRIER_TYPE: i32>() -> bool {
 /// Wait for a specific barrier type.
 ///
 /// Only for non-named barriers.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn s_barrier_wait<const BARRIER_TYPE: i16>() {
@@ -283,6 +291,8 @@ pub unsafe fn s_barrier_wait<const BARRIER_TYPE: i16>() {
 /// Get the state of a specific barrier type.
 ///
 /// The `barrier_type` argument must be uniform, otherwise behavior is undefined.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn s_get_barrier_state<const BARRIER_TYPE: i32>() -> u32 {
@@ -292,6 +302,8 @@ pub unsafe fn s_get_barrier_state<const BARRIER_TYPE: i32>() -> u32 {
 /// A barrier for only the threads within the current wavefront.
 ///
 /// Does not result in an instruction but restricts the compiler.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_barrier() {
@@ -315,6 +327,8 @@ pub fn wave_barrier() {
 /// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`.
 /// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`.
 /// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn sched_barrier<const MASK: u32>() {
@@ -345,6 +359,8 @@ pub unsafe fn sched_barrier<const MASK: u32>() {
 /// // 5 MFMA
 /// sched_group_barrier::<8, 5, 0>()
 /// ```
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn sched_group_barrier<const MASK: u32, const SIZE: u32, const SYNC_ID: u32>() {
@@ -366,6 +382,8 @@ pub fn s_sleep<const COUNT: u32>() {
 /// Stop execution of the kernel.
 ///
 /// This usually signals an error state.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn s_sethalt<const VALUE: u32>() -> ! {
@@ -407,6 +425,8 @@ pub fn mbcnt_hi(value: u32, init: u32) -> u32 {
 
 /// Returns a bitfield (`u32` or `u64`) containing the result of its i1 argument
 /// in all active lanes, and zero in all inactive lanes.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn ballot(b: bool) -> u64 {
@@ -419,6 +439,8 @@ pub fn ballot(b: bool) -> u64 {
 /// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
 /// This means `inverse_ballot(ballot(b)) == b`.
 /// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn inverse_ballot(value: u64) -> bool {
@@ -433,6 +455,8 @@ pub fn inverse_ballot(value: u64) -> bool {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_umin<const STRATEGY: u32>(value: u32) -> u32 {
@@ -447,6 +471,8 @@ pub fn wave_reduce_umin<const STRATEGY: u32>(value: u32) -> u32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_min<const STRATEGY: u32>(value: i32) -> i32 {
@@ -462,6 +488,8 @@ pub fn wave_reduce_min<const STRATEGY: u32>(value: i32) -> i32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_umax<const STRATEGY: u32>(value: u32) -> u32 {
@@ -476,6 +504,8 @@ pub fn wave_reduce_umax<const STRATEGY: u32>(value: u32) -> u32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_max<const STRATEGY: u32>(value: i32) -> i32 {
@@ -491,6 +521,8 @@ pub fn wave_reduce_max<const STRATEGY: u32>(value: i32) -> i32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_add<const STRATEGY: u32>(value: u32) -> u32 {
@@ -506,6 +538,8 @@ pub fn wave_reduce_add<const STRATEGY: u32>(value: u32) -> u32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_and<const STRATEGY: u32>(value: u32) -> u32 {
@@ -520,6 +554,8 @@ pub fn wave_reduce_and<const STRATEGY: u32>(value: u32) -> u32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_or<const STRATEGY: u32>(value: u32) -> u32 {
@@ -534,6 +570,8 @@ pub fn wave_reduce_or<const STRATEGY: u32>(value: u32) -> u32 {
 /// - 2: DPP
 ///
 /// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn wave_reduce_xor<const STRATEGY: u32>(value: u32) -> u32 {
@@ -544,12 +582,16 @@ pub fn wave_reduce_xor<const STRATEGY: u32>(value: u32) -> u32 {
 // The following intrinsics can have multiple sizes
 
 /// Get `value` from the first active lane in the wavefront.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn readfirstlane_u32(value: u32) -> u32 {
     llvm_readfirstlane_u32(value)
 }
 /// Get `value` from the first active lane in the wavefront.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn readfirstlane_u64(value: u64) -> u64 {
@@ -559,6 +601,8 @@ pub fn readfirstlane_u64(value: u64) -> u64 {
 ///
 /// The lane argument must be uniform across the currently active threads
 /// of the current wavefront. Otherwise, the result is undefined.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 {
@@ -568,6 +612,8 @@ pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 {
 ///
 /// The lane argument must be uniform across the currently active threads
 /// of the current wavefront. Otherwise, the result is undefined.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 {
@@ -582,6 +628,8 @@ pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 {
 ///
 /// `value` is the value returned by `lane`.
 /// `default` is the value returned by all lanes other than `lane`.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
@@ -596,6 +644,8 @@ pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
 ///
 /// `value` is the value returned by `lane`.
 /// `default` is the value returned by all lanes other than `lane`.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
@@ -605,6 +655,8 @@ pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
 /// Stop execution of the wavefront.
 ///
 /// This usually signals the end of a successful execution.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub fn endpgm() -> ! {
@@ -621,6 +673,8 @@ pub fn endpgm() -> ! {
 /// v_mov_b32 <dest> <old>
 /// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
 /// ```
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn update_dpp<
@@ -651,6 +705,8 @@ pub fn s_memrealtime() -> u64 {
 ///
 /// Reading from inactive lanes returns `0`.
 /// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 {
@@ -661,6 +717,8 @@ pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 {
 /// Returns the `value` given to `ds_permute` by lane `lane`.
 ///
 /// Reading from inactive lanes returns `0`.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 {
@@ -680,6 +738,8 @@ pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 {
 ///
 /// The third and fourth inputs must be uniform across the current wavefront.
 /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlane16_u32<const FI: bool, const BOUND_CONTROL: bool>(
@@ -696,6 +756,8 @@ pub unsafe fn permlane16_u32<const FI: bool, const BOUND_CONTROL: bool>(
 ///
 /// The third and fourth inputs must be uniform across the current wavefront.
 /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlanex16_u32<const FI: bool, const BOUND_CONTROL: bool>(
@@ -718,6 +780,8 @@ pub fn s_get_waveid_in_workgroup() -> u32 {
 /// Swap `value` between upper and lower 32 lanes in a wavefront.
 ///
 /// Does nothing for wave32.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlane64_u32(value: u32) -> u32 {
@@ -728,6 +792,8 @@ pub unsafe fn permlane64_u32(value: u32) -> u32 {
 /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
 ///
 /// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlane16_var<const FI: bool, const BOUND_CONTROL: bool>(
@@ -742,6 +808,8 @@ pub unsafe fn permlane16_var<const FI: bool, const BOUND_CONTROL: bool>(
 /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
 ///
 /// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlanex16_var<const FI: bool, const BOUND_CONTROL: bool>(
@@ -766,6 +834,8 @@ pub fn wave_id() -> u32 {
 /// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
 /// Returns a pair for the swapped registers.
 /// The first element of the return corresponds to the swapped element of the first argument.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlane16_swap<const FI: bool, const BOUND_CONTROL: bool>(
@@ -782,6 +852,8 @@ pub unsafe fn permlane16_swap<const FI: bool, const BOUND_CONTROL: bool>(
 /// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
 /// Returns a pair for the swapped registers.
 /// The first element of the return corresponds to the swapped element of the first argument.
+///
+#[doc = include_str!("intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
 pub unsafe fn permlane32_swap<const FI: bool, const BOUND_CONTROL: bool>(
diff --git a/crates/core_arch/src/nvptx/mod.rs b/crates/core_arch/src/nvptx/mod.rs
index 5471ef8198..b63a5d01a7 100644
--- a/crates/core_arch/src/nvptx/mod.rs
+++ b/crates/core_arch/src/nvptx/mod.rs
@@ -49,6 +49,8 @@ unsafe extern "C" {
 }
 
 /// Synchronizes all threads in the block.
+///
+#[doc = include_str!("../amdgpu/intrinsic_is_convergent.md")]
 #[inline]
 #[unstable(feature = "stdarch_nvptx", issue = "111199")]
 pub unsafe fn _syncthreads() -> () {

From 4ba4f077b8f306b21aa10d78fc0f1359d745bc6f Mon Sep 17 00:00:00 2001
From: The rustc-josh-sync Cronjob Bot <github-actions@github.com>
Date: Mon, 4 May 2026 05:12:53 +0000
Subject: [PATCH 2/2] Prepare for merging from rust-lang/rust

This updates the rust-version file to 045b17737dab5fcc28e4cbee0cfe2ce4ed363b32.
---
 rust-version | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust-version b/rust-version
index e9fc6c4cd0..59e9e5a0e6 100644
--- a/rust-version
+++ b/rust-version
@@ -1 +1 @@
-e22c616e4e87914135c1db261a03e0437255335e
+045b17737dab5fcc28e4cbee0cfe2ce4ed363b32