From 18ce64d9d1ef6dc6f01ee54b4d1327ebee2f35e7 Mon Sep 17 00:00:00 2001
From: Jean Boussier <byroot@ruby-lang.org>
Date: Thu, 25 Jun 2026 14:27:22 +0100
Subject: [PATCH 1/2] re.c: Fix conflicting flags for RMatch

`RMATCH_OFFSETS_EXTERNAL` and `MATCH_BUSY` were both assigned to
`FL_USER2`, so setting or testing one flag also affected the other, which could have caused subtle bugs.

Also add flag documentation for both RMatch and RRegexp.
---
 re.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/re.c b/re.c
index 2005bdde147945..68c6b45235f5b0 100644
--- a/re.c
+++ b/re.c
@@ -32,6 +32,30 @@
 #include "ruby/util.h"
 #include "ractor_core.h"
 
+/* Flags of RRegexp
+ *
+ * 4:     KCODE_FIXED
+ *            The regexp has "fixed encoding", meaning it can't be matched against just any ASCII-compatible string.
+ * 6:     REG_ENCODING_NONE
+ *            The regexp has no encoding. Means the `n` modifier was used.
+ */
+
+#define KCODE_FIXED FL_USER4
+#define REG_ENCODING_NONE FL_USER6
+
+/* Flags of RMatch
+ *
+ * 0:     MATCH_BUSY
+ *            The match is currently in use or may have escaped and can no longer be recycled.
+ * 1:     RMATCH_ONIG
+ *            TBD.
+ * 2:     RMATCH_OFFSETS_EXTERNAL
+ *            The match layout isn't fully embedded; offsets are stored in an external buffer,
+ *            which will need to be freed during sweep.
+ */
+
+#define MATCH_BUSY FL_USER0
+
 VALUE rb_eRegexpError, rb_eRegexpTimeoutError;
 
 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
@@ -285,10 +309,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
     return rb_memsearch_qs(x0, m, y0, n);
 }
 
-#define REG_ENCODING_NONE FL_USER6
-
-#define KCODE_FIXED FL_USER4
-
 static int
 char_to_option(int c)
 {
@@ -1528,8 +1548,6 @@ match_nth_length(VALUE match, VALUE n)
     return LONG2NUM(ofs->end - ofs->beg);
 }
 
-#define MATCH_BUSY FL_USER2
-
 void
 rb_match_busy(VALUE match)
 {

From 5d1a0239de4934cb38b0f7507783d43f0da4ab26 Mon Sep 17 00:00:00 2001
From: Kevin Menard <kevin@nirvdrum.com>
Date: Thu, 25 Jun 2026 13:47:15 -0400
Subject: [PATCH 2/2] ZJIT: Ensure we specialize the results of the last
 inlining operation (#17479)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we introduced the inliner we also added repeated passes of the optimization pipeline. The idea is that we want to optimize the results of inlining and, because we only inline one level deep, to be able to perform inlining on the result of the previous inlining operation. The optimization loop exits if we can't inline any more. If we can inline more, there's an upper bound that kicks us out of the loop so we don't try to inline the world. However, if we exited the loop by hitting that upper bound, we didn't end up specializing the results of the last inlining pass. This PR rectifies that.

This is immediately visible in the 30k_methods benchmark, where performance roughly doubles.

Before:

```
❯ WARMUP_ITRS=0 MIN_BENCH_ITRS=10 MIN_BENCH_TIME=0 ./run_benchmarks.rb --chruby 'ruby-master --zjit-inline-threshold=30' 30k_methods
Running benchmark "30k_methods" (1/1)
+ /Users/nirvdrum/.rubies/ruby-master/bin/ruby --zjit-inline-threshold\=30 -I harness /Users/nirvdrum/dev/worktrees/ruby-bench/main/benchmarks/30k_methods.rb
ruby 4.1.0dev (2026-06-23T13:29:36Z master 13fe77dd2b) +ZJIT dev +PRISM [arm64-darwin25]
itr:   time
 #1: 2689ms
 #2:   33ms
 #3:   32ms
 #4:   32ms
 #5:   32ms
 #6:   32ms
 #7:   32ms
 #8:   35ms
 #9:   33ms
#10:   33ms
```

After:

```
❯ WARMUP_ITRS=0 MIN_BENCH_ITRS=10 MIN_BENCH_TIME=0 ./run_benchmarks.rb --chruby 'ruby-zjit-opt-last-inline --zjit-inline-threshold=30' 30k_methods
Running benchmark "30k_methods" (1/1)
+ /Users/nirvdrum/.rubies/ruby-zjit-opt-last-inline/bin/ruby --zjit-inline-threshold\=30 -I harness /Users/nirvdrum/dev/worktrees/ruby-bench/main/benchmarks/30k_methods.rb
ruby 4.1.0dev (2026-06-25T13:56:41Z zjit-opt-last-inline 18ce64d9d1) +ZJIT dev +PRISM [arm64-darwin25]
itr:   time
 #1: 2700ms
 #2:   17ms
 #3:   16ms
 #4:   16ms
 #5:   17ms
 #6:   16ms
 #7:   16ms
 #8:   17ms
 #9:   16ms
#10:   16ms
```

Fixes https://github.com/Shopify/ruby/issues/998.
---
 zjit/src/hir.rs           | 22 +++++++++-----
 zjit/src/hir/opt_tests.rs | 60 +++++++++++++++++++++++++++++++++++++++
 zjit/src/options.rs       |  9 ++++--
 3 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs
index ac48e538227090..cdd9120af85a92 100644
--- a/zjit/src/hir.rs
+++ b/zjit/src/hir.rs
@@ -6221,19 +6221,27 @@ impl Function {
         }
 
         // The optimization pipeline runs in a fixed-point loop so that inlining and
-        // type specialization can feed each other: the first iteration inlines direct
-        // calls and specializes the inlined code, and subsequent iterations can inline
-        // calls that only became monomorphic after the previous round of specialization.
-        // Termination is guaranteed because each iteration either inlines at least one
-        // call (growing the function toward the inlining budget) or reaches a fixed point.
-        for _ in 0..get_option!(inline_max_iterations) {
+        // type specialization can feed each other: an iteration inlines direct calls and
+        // the next one specializes the freshly inlined code, which in turn can expose
+        // calls that only became monomorphic after that specialization. Inlining naturally
+        // stops when it reaches a fixed point, while inline_max_iterations sets an upper bound
+        // on inlining passes. If we reach the max, we run the loop one more time with inlining
+        // disabled in order to optimize the results of the last inlining operation.
+        let inline_max_iterations = get_option!(inline_max_iterations);
+        for iteration in 0..=inline_max_iterations {
             // Function is assumed to have types inferred already
             run_pass!(type_specialize);
             // The trivial inliner runs first to handle simple cases (constant returns,
             // parameter returns, etc.) without frame push/pop overhead. The general
             // inliner then handles more complex methods that require full inlining.
             run_pass!(inline_trivial);
-            let did_inline = run_pass!(inline_methods);
+            // Cap inlining at inline_max_iterations passes; the trailing iteration (see above)
+            // runs the rest of the pipeline with inlining off.
+            let did_inline = if iteration < inline_max_iterations {
+                run_pass!(inline_methods)
+            } else {
+                false
+            };
             run_pass!(optimize_c_calls);
             run_pass!(convert_no_profile_sends);
             run_pass!(optimize_load_store);
diff --git a/zjit/src/hir/opt_tests.rs b/zjit/src/hir/opt_tests.rs
index c2b31cb2b1a665..933763e2b53660 100644
--- a/zjit/src/hir/opt_tests.rs
+++ b/zjit/src/hir/opt_tests.rs
@@ -17985,6 +17985,66 @@ mod hir_opt_tests {
         ");
     }
 
+    #[test]
+    fn test_final_inline_iteration_specializes_inlined_iseq_send() {
+        eval("
+            def inner(x)
+              x + 1
+            end
+            def outer(x)
+              inner(x)
+            end
+            def test(n)
+              outer(n)
+            end
+            test(1)
+            test(1)
+        ");
+
+        let old_threshold = get_option!(inline_threshold);
+        let old_max_iterations = get_option!(inline_max_iterations);
+        unsafe {
+            OPTIONS.as_mut().unwrap().inline_threshold = 30;
+            OPTIONS.as_mut().unwrap().inline_max_iterations = 1;
+        }
+        let result = hir_string("test");
+        unsafe {
+            OPTIONS.as_mut().unwrap().inline_threshold = old_threshold;
+            OPTIONS.as_mut().unwrap().inline_max_iterations = old_max_iterations;
+        }
+
+        assert!(result.contains("PushInlineFrame"),
+            "Expected outer to be inlined with inline_max_iterations=1:\n{result}");
+        assert!(result.contains(" = SendDirect "),
+            "Expected the Send inside the final inlined body to be specialized to SendDirect:\n{result}");
+        assert!(!result.contains(" = Send "),
+            "Expected no unspecialized Send after the final specialization round:\n{result}");
+
+        assert_snapshot!(result, @"
+        fn test@<compiled>:9:
+        bb1():
+          EntryPoint interpreter
+          v1:BasicObject = LoadSelf
+          v2:CPtr = LoadSP
+          v3:BasicObject = LoadField v2, :n@0x1000
+          Jump bb3(v1, v3)
+        bb2():
+          EntryPoint JIT(0)
+          v6:BasicObject = LoadArg :self@0
+          v7:BasicObject = LoadArg :n@1
+          Jump bb3(v6, v7)
+        bb3(v9:BasicObject, v10:BasicObject):
+          PatchPoint MethodRedefined(Object@0x1008, outer@0x1010, cme:0x1018)
+          v23:ObjectSubclass[class_exact*:Object@VALUE(0x1008)] = GuardType v9, ObjectSubclass[class_exact*:Object@VALUE(0x1008)] recompile
+          PushInlineFrame v23 (0x1040), v10
+          PatchPoint MethodRedefined(Object@0x1008, inner@0x1048, cme:0x1050)
+          v43:BasicObject = SendDirect v23, 0x1078, :inner (0x1088), v10
+          CheckInterrupts
+          PopInlineFrame
+          Return v43
+        ");
+    }
+
     #[test]
     fn test_inline_budget_rejects_when_exceeded() {
         // The same workload as test_inline_arithmetic_method, which we know inlines
diff --git a/zjit/src/options.rs b/zjit/src/options.rs
index ba187dea5a253d..bf09235002dd8d 100644
--- a/zjit/src/options.rs
+++ b/zjit/src/options.rs
@@ -178,9 +178,12 @@ pub struct Options {
     /// Upper bound on how many times the `optimize` fixed-point loop will iterate
     /// before giving up. Each iteration runs `type_specialize` → `inline` →
     /// `inline_methods` → the rest of the HIR pipeline; in steady state the loop
-    /// terminates as soon as an iteration fails to inline anything new. The cap
-    /// exists to bound compile time when something pathological prevents the loop
-    /// from reaching a fixed point.
+    /// terminates as soon as an iteration fails to inline anything new. If the
+    /// cap is hit while inlining is still ongoing, the optimizer runs one final
+    /// specialization/cleanup round without `inline_methods`, so the callee HIR
+    /// inserted by the last iteration does not keep unspecialized `Send`s. The
+    /// cap exists to bound compile time when something pathological prevents the
+    /// loop from reaching a fixed point.
     pub inline_max_iterations: InlineDepth,
 }