From 94f5c444fdef07823f0d71796878ea0ff4767f30 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 3 Oct 2024 20:02:30 -0400 Subject: [PATCH 01/15] i#6959: Add exit_if_fraction_inputs_left option (#7018) Adds a new scheduler feature and CLI option exit_if_fraction_inputs_left. This applies to -core_sharded and -core_serial modes. When an input reaches EOF, if the number of non-EOF inputs left as a fraction of the original inputs is equal to or less than this value then the scheduler exits (sets all outputs to EOF) rather than finishing off the final inputs. This helps avoid long sequences of idles during staggered endings with fewer inputs left than cores and only a small fraction of the total instructions left in those inputs. The default value in scheduler_options_t and the CLI option is 0.05 (i.e., 5%), which when tested on an large internal trace helps eliminate much of the final idle time from the cores without losing many instructions. Compare the numbers below for today's default with a long idle time and so distinct differences between the "cpu busy by time" and "cpu busy by time, ignoring idle past last instr" stats on a 39-core schedule-stats run of a moderately large trace, with key stats and the 1st 2 cores (for brevity) shown here: ``` 1567052521 instructions 878027975 idles 64.09% cpu busy by record count 82.38% cpu busy by time 96.81% cpu busy by time, ignoring idle past last instr Core #0 schedule: CccccccOXHhUuuuuAaSEOGOWEWQqqqFffIiTETENWwwOWEeeeeeeACMmTQFfOWLWVvvvvFQqqqqYOWOooOWOYOYQOWO_O_W_O_W_O_W_O_WO_WO_O_O_O_O_O_OR_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_R_RY_YyyyySUuuOSISO_S_S_SOPpSOKO_KO_KCcDKWDB_B_____________________________________________ Core #1 schedule: KkLWSFUQPDddddddddXxSUSVRJWKkRNJBWUWwwTttGgRNKkkRWNTtFRWKkRNWUuuGULRFSRSYKkkkRYAYFffGSRYHRYHNWMDddddddddRYGgggggYHNWK_YAHYNnGYSNHWwwwwSWSNKSYyyWKNNWKNNGAKWGggNnNW_NNWE_E_EF__________________________________________________ ``` And now with -exit_if_fraction_inputs_left 0.05, 
where we lose (1567052521 - 1564522227)/1567052521. = 0.16% of the instructions but drastically reduce the tail from 14% of the time to less than 1% of the time: ``` 1564522227 instructions 120512812 idles 92.85% cpu busy by record count 96.39% cpu busy by time 97.46% cpu busy by time, ignoring idle past last instr Core #0 schedule: CccccccOXHKYEGGETRARrrPRTVvvvRrrNWwwOOKWVRRrPBbbXUVvvvvvOWKVLWVvvJjSOWKVUuTIiiiFPpppKAaaMFfffAHOKWAaGNBOWKAPPOABCWKPWOKWPCXxxxZOWKCccJSOSWKJUYRCOWKCcSOSUKkkkOROK_O_O_O_O_O Core #1 schedule: KkLWSMmmFLSFffffffJjWBbGBUuuuuuuuuuuBDBJJRJWKkRNJWMBKkkRNWKkRNWKkkkRNWXxxxxxZOooAaUIiTHhhhSDNnnnHZzQNnnRNWXxxxxxRNWUuuRNWKXUuXRNKRWKNXxxRWKONNHRKWONURKWXRKXRKNW_KR_KkRK_KRKR_R_R_R_R_R_R_R_R_R_R_R__R__R__R___R___R___R___R___R ``` Fixes #6959 --- clients/drcachesim/analyzer_multi.cpp | 2 + clients/drcachesim/common/options.cpp | 12 ++ clients/drcachesim/common/options.h | 1 + clients/drcachesim/scheduler/scheduler.cpp | 69 +++++++-- clients/drcachesim/scheduler/scheduler.h | 16 ++- .../drcachesim/tests/scheduler_unit_tests.cpp | 136 ++++++++++++++++-- 6 files changed, 208 insertions(+), 28 deletions(-) diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index beb0e526e6f..c27f231831c 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -574,6 +574,8 @@ analyzer_multi_tmpl_t::init_dynamic_schedule() sched_ops.rebalance_period_us = op_sched_rebalance_period_us.get_value(); sched_ops.randomize_next_input = op_sched_randomize.get_value(); sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value(); + sched_ops.exit_if_fraction_inputs_left = + op_sched_exit_if_fraction_inputs_left.get_value(); #ifdef HAS_ZIP if (!op_record_file.get_value().empty()) { record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value())); diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 889a0225a7c..10de4990314 100644 
--- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -1034,6 +1034,18 @@ droption_t op_sched_rebalance_period_us( "The period in simulated microseconds at which per-core run queues are re-balanced " "to redistribute load."); +droption_t op_sched_exit_if_fraction_inputs_left( + DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.05, + "Exit if non-EOF inputs left are <= this fraction of the total", + "Applies to -core_sharded and -core_serial. When an input reaches EOF, if the " + "number of non-EOF inputs left as a fraction of the original inputs is equal to or " + "less than this value then the scheduler exits (sets all outputs to EOF) rather than " + "finishing off the final inputs. This helps avoid long sequences of idles during " + "staggered endings with fewer inputs left than cores and only a small fraction of " + "the total instructions left in those inputs. Since the remaining instruction " + "count is not considered (as it is not available), use discretion when raising " + "this value on uneven inputs."); + // Schedule_stats options. 
droption_t op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 75633a1928c..ef24824b4d0 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -221,6 +221,7 @@ extern dynamorio::droption::droption_t op_sched_infinite_timeouts; extern dynamorio::droption::droption_t op_sched_migration_threshold_us; extern dynamorio::droption::droption_t op_sched_rebalance_period_us; extern dynamorio::droption::droption_t op_sched_time_units_per_us; +extern dynamorio::droption::droption_t op_sched_exit_if_fraction_inputs_left; extern dynamorio::droption::droption_t op_schedule_stats_print_every; extern dynamorio::droption::droption_t op_syscall_template_file; extern dynamorio::droption::droption_t op_filter_stop_timestamp; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index e9ae6114265..16fb96798bf 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -725,6 +725,8 @@ scheduler_tmpl_t::print_configuration() options_.rebalance_period_us); VPRINT(this, 1, " %-25s : %d\n", "honor_infinite_timeouts", options_.honor_infinite_timeouts); + VPRINT(this, 1, " %-25s : %f\n", "exit_if_fraction_inputs_left", + options_.exit_if_fraction_inputs_left); } template @@ -1027,6 +1029,11 @@ scheduler_tmpl_t::legacy_field_support() error_string_ = "block_time_max_us must be > 0"; return STATUS_ERROR_INVALID_PARAMETER; } + if (options_.exit_if_fraction_inputs_left < 0. || + options_.exit_if_fraction_inputs_left > 1.) 
{ + error_string_ = "exit_if_fraction_inputs_left must be 0..1"; + return STATUS_ERROR_INVALID_PARAMETER; + } return STATUS_SUCCESS; } @@ -2339,7 +2346,11 @@ scheduler_tmpl_t::advance_region_of_interest( return status; } input.queue.push_back(create_thread_exit(input.tid)); - mark_input_eof(input); + sched_type_t::stream_status_t status = mark_input_eof(input); + // For early EOF we still need our synthetic exit so do not return it yet. + if (status != sched_type_t::STATUS_OK && + status != sched_type_t::STATUS_EOF) + return status; return sched_type_t::STATUS_SKIPPED; } } @@ -2466,7 +2477,9 @@ scheduler_tmpl_t::skip_instructions(input_info_t &input, input.instrs_pre_read = 0; } if (*input.reader == *input.reader_end) { - mark_input_eof(input); + sched_type_t::stream_status_t status = mark_input_eof(input); + if (status != sched_type_t::STATUS_OK) + return status; // Raise error because the input region is out of bounds, unless the max // was used which we ourselves use internally for times_of_interest. if (skip_amount >= std::numeric_limits::max() - 2) { @@ -3109,11 +3122,13 @@ scheduler_tmpl_t::pick_next_input_as_previously( // queued candidate record, if any. clear_input_queue(inputs_[index]); inputs_[index].queue.push_back(create_thread_exit(inputs_[index].tid)); - mark_input_eof(inputs_[index]); VPRINT(this, 2, "early end for input %d\n", index); // We're done with this entry but we need the queued record to be read, // so we do not move past the entry. 
outputs_[output].record_index->fetch_add(1, std::memory_order_release); + sched_type_t::stream_status_t status = mark_input_eof(inputs_[index]); + if (status != sched_type_t::STATUS_OK) + return status; return sched_type_t::STATUS_SKIPPED; } else if (segment.type == schedule_record_t::SKIP) { std::lock_guard lock(*inputs_[index].lock); @@ -3456,8 +3471,11 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (inputs_[index].at_eof || *inputs_[index].reader == *inputs_[index].reader_end) { VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index); - if (!inputs_[index].at_eof) - mark_input_eof(inputs_[index]); + if (!inputs_[index].at_eof) { + sched_type_t::stream_status_t status = mark_input_eof(inputs_[index]); + if (status != sched_type_t::STATUS_OK) + return status; + } index = INVALID_INPUT_ORDINAL; // Loop and pick next thread. continue; @@ -3802,8 +3820,11 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->needs_advance = true; } if (input->at_eof || *input->reader == *input->reader_end) { - if (!input->at_eof) - mark_input_eof(*input); + if (!input->at_eof) { + sched_type_t::stream_status_t status = mark_input_eof(*input); + if (status != sched_type_t::STATUS_OK) + return status; + } lock.unlock(); VPRINT(this, 5, "next_record[%d]: need new input (cur=%d eof)\n", output, input->index); @@ -4167,17 +4188,28 @@ scheduler_tmpl_t::stop_speculation(output_ordinal_t outp } template -void +typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::mark_input_eof(input_info_t &input) { assert(input.lock->owned_by_cur_thread()); if (input.at_eof) - return; + return sched_type_t::STATUS_OK; input.at_eof = true; - assert(live_input_count_.load(std::memory_order_acquire) > 0); - live_input_count_.fetch_add(-1, std::memory_order_release); - VPRINT(this, 2, "input %d at eof; %d live inputs left\n", input.index, - live_input_count_.load(std::memory_order_acquire)); +#ifndef NDEBUG + int old_count = +#endif + 
live_input_count_.fetch_add(-1, std::memory_order_release); + assert(old_count > 0); + int live_inputs = live_input_count_.load(std::memory_order_acquire); + VPRINT(this, 2, "input %d at eof; %d live inputs left\n", input.index, live_inputs); + if (options_.mapping == MAP_TO_ANY_OUTPUT && + live_inputs <= + static_cast(inputs_.size() * options_.exit_if_fraction_inputs_left)) { + VPRINT(this, 1, "exiting early at input %d with %d live inputs left\n", + input.index, live_inputs); + return sched_type_t::STATUS_EOF; + } + return sched_type_t::STATUS_OK; } template @@ -4188,8 +4220,8 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, // XXX i#6831: Refactor to use subclasses or templates to specialize // scheduler code based on mapping options, to avoid these top-level // conditionals in many functions? - if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || - live_input_count_.load(std::memory_order_acquire) == 0 || + int live_inputs = live_input_count_.load(std::memory_order_acquire); + if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || live_inputs == 0 || // While a full schedule recorded should have each input hit either its // EOF or ROI end, we have a fallback to avoid hangs for possible recorded // schedules that end an input early deliberately without an ROI. @@ -4198,6 +4230,13 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); return sched_type_t::STATUS_EOF; } + if (options_.mapping == MAP_TO_ANY_OUTPUT && + live_inputs <= + static_cast(inputs_.size() * options_.exit_if_fraction_inputs_left)) { + VPRINT(this, 1, "output %d exiting early with %d live inputs left\n", output, + live_inputs); + return sched_type_t::STATUS_EOF; + } // Before going idle, try to steal work from another output. // We start with us+1 to avoid everyone stealing from the low-numbered outputs. 
// We only try when we first transition to idle; we rely on rebalancing after that, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 48f41280e02..84b0cbe31f1 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -805,6 +805,17 @@ template class scheduler_tmpl_t { * (#block_time_max_us) scaled by #block_time_multiplier. */ bool honor_infinite_timeouts = false; + /** + * For #MAP_TO_ANY_OUTPUT, when an input reaches EOF, if the number of non-EOF + * inputs left as a fraction of the original inputs is equal to or less than + * this value then the scheduler exits (sets all outputs to EOF) rather than + * finishing off the final inputs. This helps avoid long sequences of idles + * during staggered endings with fewer inputs left than cores and only a small + * fraction of the total instructions left in those inputs. Since the remaining + * instruction count is not considered (as it is not available), use discretion + * when raising this value on uneven inputs. + */ + double exit_if_fraction_inputs_left = 0.05; // When adding new options, also add to print_configuration(). }; @@ -2021,7 +2032,10 @@ template class scheduler_tmpl_t { set_output_active(output_ordinal_t output, bool active); // Caller must hold the input's lock. - void + // The return value is STATUS_EOF if a global exit is now happening (an + // early exit); otherwise STATUS_OK is returned on success but only a + // local EOF. 
+ stream_status_t mark_input_eof(input_info_t &input); // Determines whether to exit or wait for other outputs when one output diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index d88977f2dbd..21f8fba7a6c 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -1598,6 +1598,8 @@ test_synthetic_with_timestamps() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration_instrs = 3; + // Test dropping a final "_" from core0. + sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1615,9 +1617,8 @@ test_synthetic_with_timestamps() // thus core0 has C,I,B,H,D and core1 has F,J,E,A,G. // We should interleave within each group -- except once we reach J // we should completely finish it. There should be no migrations. - assert( - sched_as_string[0] == - ".CC.C.II.IC.CC.I.II.CC.C.II.I..BB.B.HH.HB.BB.H.HH.BB.B.HH.H..DD.DD.DD.DD.D._"); + assert(sched_as_string[0] == + ".CC.C.II.IC.CC.I.II.CC.C.II.I..BB.B.HH.HB.BB.H.HH.BB.B.HH.H..DD.DD.DD.DD.D."); assert(sched_as_string[1] == ".FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..EE.EE.EE.EE.E..AA.A.GG.GA.AA.G.GG.AA.A.GG.G."); // Check scheduler stats. # switches is the # of letter transitions; # preempts @@ -1625,7 +1626,7 @@ test_synthetic_with_timestamps() // appearing in between (and ignoring the last letter for an input: EOF doesn't // count as a preempt). 
verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/12, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/2, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/9, @@ -1699,6 +1700,8 @@ test_synthetic_with_priorities() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration_instrs = 3; + // Test dropping a final "_" from core0. + sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1711,9 +1714,8 @@ test_synthetic_with_priorities() // See the test_synthetic_with_timestamps() test which has our base sequence. // We've elevated B, E, and H to higher priorities so they go // first. J remains uninterrupted due to lower timestamps. - assert( - sched_as_string[0] == - ".BB.B.HH.HB.BB.H.HH.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..DD.DD.DD.DD.D._"); + assert(sched_as_string[0] == + ".BB.B.HH.HB.BB.H.HH.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..DD.DD.DD.DD.D."); assert(sched_as_string[1] == ".EE.EE.EE.EE.E..CC.C.II.IC.CC.I.II.CC.C.II.I..AA.A.GG.GA.AA.G.GG.AA.A.GG.G."); // Check scheduler stats. # switches is the # of letter transitions; # preempts @@ -1721,7 +1723,7 @@ test_synthetic_with_priorities() // appearing in between (and ignoring the last letter for an input: EOF doesn't // count as a preempt). 
verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/9, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/5, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/12, @@ -2032,6 +2034,8 @@ test_synthetic_with_syscalls_multiple() sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; sched_ops.block_time_multiplier = BLOCK_SCALE; + // Test dropping a bunch of final "_" from core1. + sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2054,9 +2058,8 @@ test_synthetic_with_syscalls_multiple() // explains why the two strings are different lengths. assert(sched_as_string[0] == "BHHHFFFJJJJJJBHHHJJJFFFFFFBHHHDDDDDDDDDB__________B__________B__________B____" - "______B_______B"); - assert(sched_as_string[1] == - "EECCCIIICCCIIIEECCCIIIAAAGGGEEAAAGGEEGAAEGGAG_________"); + "______B__________B"); + assert(sched_as_string[1] == "EECCCIIICCCIIIEECCCIIIAAAGGGEEAAAGGEEGAAEGGAG"); // Check scheduler stats. 
# switches is the # of letter transitions; # preempts // is the instances where the same letter appears 3 times without another letter // appearing in between (and ignoring the last letter for an input: EOF doesn't @@ -2066,7 +2069,7 @@ test_synthetic_with_syscalls_multiple() /*switch_nop=*/4, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/19, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/3, /*preempts=*/16, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); } @@ -6001,6 +6004,114 @@ test_rebalancing() } } +static void +test_exit_early() +{ + std::cerr << "\n----------------\nTesting exiting early\n"; + static constexpr int NUM_INPUTS = 12; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t TIMESTAMP = 101; + static constexpr uint64_t BLOCK_LATENCY = 1500; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(TIMESTAMP)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // One input has a long blocking syscall toward the end. + if (i == 0 && j == NUM_INSTRS - 2) { + inputs[i].push_back(make_timestamp(TIMESTAMP)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + inputs[i].push_back(make_timestamp(TIMESTAMP + BLOCK_LATENCY)); + } + } + inputs[i].push_back(make_exit(tid)); + } + { + // Run without any early exit. 
+ std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + // We use our mock's time==instruction count for a deterministic result. + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.exit_if_fraction_inputs_left = 0.; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // We have a long idle wait just to execute A's final instruction. + static const char *const CORE0_SCHED_STRING = + "..AAA..CCC..EEE..GGG..III..KKKAAACCCEEEGGGIIIKKKAA....CCC.EEE.GGG.III.KKK.__" + "_________________________________________________________________A."; + static const char *const CORE1_SCHED_STRING = + "..BBB..DDD..FFF..HHH..JJJ..LLLBBBDDDFFFHHHJJJLLLBBB.DDD.FFF.HHH.JJJ.LLL.____" + "___________________________________________________________________"; + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } + { + // Run with any early exit. 
+ std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + // We use our mock's time==instruction count for a deterministic result. + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + // NUM_INPUTS=12 * 0.1 = 1.2 so we'll exit with 1 input left. + sched_ops.exit_if_fraction_inputs_left = 0.1; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // Now we exit after K and never execute the 9th A. 
+ static const char *const CORE0_SCHED_STRING = + "..AAA..CCC..EEE..GGG..III..KKKAAACCCEEEGGGIIIKKKAA....CCC.EEE.GGG.III.KKK."; + static const char *const CORE1_SCHED_STRING = + "..BBB..DDD..FFF..HHH..JJJ..LLLBBBDDDFFFHHHJJJLLLBBB.DDD.FFF.HHH.JJJ.LLL.__"; + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + int test_main(int argc, const char *argv[]) { @@ -6042,6 +6153,7 @@ test_main(int argc, const char *argv[]) test_random_schedule(); test_record_scheduler(); test_rebalancing(); + test_exit_early(); dr_standalone_exit(); return 0; From dff98b7a4a121db71dcea8ce271a7d355a888589 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 4 Oct 2024 14:31:43 -0400 Subject: [PATCH 02/15] i#6971: Use instr count instead of wallclock for simulated time (#7015) When using the drmemtrace scheduler in an analyzer or other tool that does not track simulated time with the default QUANTUM_INSTRUCTIONS, the scheduler used to use wall-clock time to measure blocking-input and idle time. Here we change that to use the instruction count plus the idle count via a new idle counter. The time_units_per_us and sched_time_units_per_us defaults are set to 1000, reflecting a 2GHz machine with IPC=0.5. The old time_units_per_us=100 for wall clock was too low; to match it with counts, we need a low sched_time_units_per_us: 500 is better than 1000, but that seems unrealistic. Instead we can get the results we want from our large traces by exiting earlier, since most of the unwanted idle is still in seemingly unrepresentative regions at the end. We raise exit_if_fraction_inputs_left from 0.05 to 0.1 here. Using counters provides a more reproducible result across different runs and machines. Wall-clock time is still used to measure idle time on replay. Switching to the idle count added here is left as separate work under #7023. 
(Replay also uses wall-clock time to coordinate concurrent outputs beyond shared input constraints; that will likely always remain.) The new default values of the options were tested on larger traces and found to produce a representative level of idle time. This change means that the clock going backward problem (#6966) is no longer seen in default runs. The analyzer still supports wall-clock with the -sched_time option so a check to avoid underflow is added. Fixes #6971 Fixes #6966 --- clients/drcachesim/analyzer.cpp | 2 +- clients/drcachesim/common/options.cpp | 25 ++++++------- clients/drcachesim/scheduler/scheduler.cpp | 35 +++++++++++++------ clients/drcachesim/scheduler/scheduler.h | 33 ++++++++++++----- .../drcachesim/tests/scheduler_unit_tests.cpp | 4 +-- 5 files changed, 66 insertions(+), 33 deletions(-) diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index a4f860f4ec4..f2c2ddb5098 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -524,7 +524,7 @@ analyzer_tmpl_t::process_serial(analyzer_worker_data_t & while (true) { RecordType record; // The current time is used for time quanta; for instr quanta, it's ignored and - // we pass 0. + // we pass 0 and let the scheduler use instruction + idle counts. uint64_t cur_micros = sched_by_time_ ? get_current_microseconds() : 0; typename sched_type_t::stream_status_t status = worker.stream->next_record(record, cur_micros); diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 10de4990314..5166a8bcd63 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -908,17 +908,17 @@ droption_t // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000, "Scheduling quantum", - "Applies to -core_sharded and -core_serial. 
" - "Scheduling quantum in instructions, unless -sched_time is set in " - "which case this value is multiplied by -sched_time_per_us to " - "produce a quantum in wall-clock microseconds."); + "Applies to -core_sharded and -core_serial. Scheduling quantum in " + "instructions, unless -sched_time is set in which case this value " + "is the quantum in simulated microseconds (equal to wall-clock " + "microseconds multiplied by -sched_time_per_us)."); droption_t op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false, "Whether to use time for the scheduling quantum", - "Applies to -core_sharded and -core_serial. " - "Whether to use wall-clock time for the scheduling quantum, with a " - "value equal to -sched_quantum in microseconds of wall-clock time."); + "Applies to -core_sharded and -core_serial. Whether to use wall-clock " + "time (multiplied by -sched_time_per_us) for measuring idle time and " + "for the scheduling quantum (see -sched_quantum)."); droption_t op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", true, "Whether to honor recorded timestamps for ordering", @@ -1016,11 +1016,12 @@ droption_t op_sched_infinite_timeouts( "(set to false)."); droption_t op_sched_time_units_per_us( - DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100., + DROPTION_SCOPE_ALL, "sched_time_units_per_us", 1000., "Time units per simulated microsecond", - "Time units (currently wall-clock time) per simulated microsecond. This scales all " - "of the -sched_*_us values as it converts wall-clock time into the simulated " - "microseconds measured by those options."); + "Time units per simulated microsecond. The units are either the instruction count " + "plus the idle count (the default) or if -sched_time is selected wall-clock " + "microseconds. 
This option value scales all of the -sched_*_us values as it " + "converts time units into the simulated microseconds measured by those options."); droption_t op_sched_migration_threshold_us( DROPTION_SCOPE_ALL, "sched_migration_threshold_us", 500, @@ -1035,7 +1036,7 @@ droption_t op_sched_rebalance_period_us( "to redistribute load."); droption_t op_sched_exit_if_fraction_inputs_left( - DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.05, + DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.1, "Exit if non-EOF inputs left are <= this fraction of the total", "Applies to -core_sharded and -core_serial. When an input reaches EOF, if the " "number of non-EOF inputs left as a fraction of the original inputs is equal to or " diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 16fb96798bf..072f92c622f 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2721,8 +2721,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( if (res->blocked_time > 0 && // cur_time can be 0 at initialization time. (cur_time == 0 || - // XXX i#6966: We have seen wall-clock time go backward, which - // underflows here and then always unblocks the input. + // Guard against time going backward (happens for wall-clock: i#6966). + cur_time < res->blocked_start_time || cur_time - res->blocked_start_time < res->blocked_time)) { VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n", res->index, @@ -2733,6 +2733,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( // This input is no longer blocked. res->blocked_time = 0; res->unscheduled = false; + VPRINT(this, 4, "pop queue: %d @ %" PRIu64 " no longer blocked\n", + res->index, cur_time); // We've found a candidate. One final check if this is a migration. 
bool found_candidate = false; if (from_output == for_output) @@ -2745,7 +2747,7 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( from_output, for_output, cur_time, res->last_run_time, cur_time - res->last_run_time, options_.migration_threshold_us); - // Guard against time going backward, which happens: i#6966. + // Guard against time going backward (happens for wall-clock: i#6966). if (options_.migration_threshold_us == 0 || res->last_run_time == 0 || (cur_time > res->last_run_time && cur_time - res->last_run_time >= @@ -2771,6 +2773,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( if (res == nullptr && !blocked.empty()) { // Do not hand out EOF thinking we're done: we still have inputs blocked // on i/o, so just wait and retry. + if (for_output != INVALID_OUTPUT_ORDINAL) + ++outputs_[for_output].idle_count; status = STATUS_IDLE; } // Re-add the ones we skipped, but without changing their counters so we preserve @@ -3039,6 +3043,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( outputs_[output].waiting = true; outputs_[output].wait_start_time = get_output_time(output); outputs_[output].record_index->fetch_add(1, std::memory_order_release); + ++outputs_[output].idle_count; return sched_type_t::STATUS_IDLE; } index = segment.key.input; @@ -3729,17 +3734,25 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // do return an error on a time smaller than an input's current start time when we // check for quantum end. if (cur_time == 0) { - // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of - // in get_output_time(). This also makes the two more similarly behaved with - // respect to blocking system calls. - // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely - // with a counter-based time, weighted appropriately for STATUS_IDLE. 
- cur_time = get_time_micros(); + if (options_.mapping == MAP_AS_PREVIOUSLY) { + // XXX i#7023: We should instead store the simulator's time (whether + // passed in or our instr-based formula below) in the records and do away + // with wall-clock time for idle measurement. Either way, we should make + // it clear in the docs whether the user/simulator has to pass in the + // time on replay. + cur_time = get_time_micros(); + } else { + // We add 1 to avoid an invalid value of 0. + cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + + outputs_[output].idle_count; + } } // Invalid values for cur_time are checked below. outputs_[output].cur_time->store(cur_time, std::memory_order_release); - if (!outputs_[output].active->load(std::memory_order_acquire)) + if (!outputs_[output].active->load(std::memory_order_acquire)) { + ++outputs_[output].idle_count; return sched_type_t::STATUS_IDLE; + } if (outputs_[output].waiting) { if (options_.mapping == MAP_AS_PREVIOUSLY && outputs_[output].wait_start_time > 0) { @@ -3752,6 +3765,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, VPRINT(this, 4, "next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n", output, now - outputs_[output].wait_start_time, duration); + // XXX i#7023: This should always be STATUS_IDLE, right? 
return sched_type_t::STATUS_WAIT; } else outputs_[output].wait_start_time = 0; @@ -4266,6 +4280,7 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, if (prev_input != INVALID_INPUT_ORDINAL) ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE]; set_cur_input(output, INVALID_INPUT_ORDINAL); + ++outputs_[output].idle_count; return sched_type_t::STATUS_IDLE; } diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 84b0cbe31f1..2ea8b334154 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -739,8 +739,11 @@ template class scheduler_tmpl_t { * other parameters that are in microseconds (they all end in "_us": e.g., * #quantum_duration_us) so that they operate on the right time scale for the * passed-in simulator time (or wall-clock microseconds if no time is passed). + * The default value is a rough estimate when no accurate simulated time is + * available: the instruction count is used in that case, and we use the + * instructions per microsecond for a 2GHz clock at 0.5 IPC as our default. */ - double time_units_per_us = 100.; + double time_units_per_us = 1000.; /** * The scheduling quantum duration for preemption, in simulated microseconds, * for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to @@ -815,7 +818,7 @@ template class scheduler_tmpl_t { * instruction count is not considered (as it is not available), use discretion * when raising this value on uneven inputs. */ - double exit_if_fraction_inputs_left = 0.05; + double exit_if_fraction_inputs_left = 0.1; // When adding new options, also add to print_configuration(). }; @@ -874,7 +877,8 @@ template class scheduler_tmpl_t { // diminished. /** * Advances to the next record in the stream. Returns a status code on whether - * and how to continue. + * and how to continue. 
Uses the instruction count plus idle count to this point + * as the time; use the variant that takes "cur_time" to instead provide a time. */ virtual stream_status_t next_record(RecordType &record); @@ -882,11 +886,12 @@ template class scheduler_tmpl_t { /** * Advances to the next record in the stream. Returns a status code on whether * and how to continue. Supplies the current time for #QUANTUM_TIME. The time - * should be considered to be the time prior to processing the returned record. - * The time is unitless but needs to be a globally consistent increasing value - * across all output streams. #STATUS_INVALID is returned if 0 or a value smaller - * than the start time of the current input's quantum is passed in when - * #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified. + * should be considered to be the simulated time prior to processing the returned + * record. The time's units can be chosen by the caller, with + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.time_units_per_us + * providing the conversion to simulated microseconds. #STATUS_INVALID is + * returned if 0 or a value smaller than the start time of the current input's + * quantum is passed in when #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified. */ virtual stream_status_t next_record(RecordType &record, uint64_t cur_time); @@ -996,6 +1001,15 @@ template class scheduler_tmpl_t { return scheduler_->get_input_stream(ordinal_)->get_instruction_ordinal(); return cur_instr_count_; } + /** + * Identical to get_instruction_ordinal() but ignores the + * #SCHEDULER_USE_INPUT_ORDINALS flag. + */ + uint64_t + get_output_instruction_ordinal() const + { + return cur_instr_count_; + } /** * Returns a name for the current input stream feeding this output stream. For * stored offline traces, this is the base name of the trace on disk. For online @@ -1683,6 +1697,9 @@ template class scheduler_tmpl_t { // Exported statistics. Currently all integers and cast to double on export. 
std::vector stats = std::vector(memtrace_stream_t::SCHED_STAT_TYPE_COUNT); + // When no simulation time is passed to us, we use the idle count plus + // instruction count to measure time. + uint64_t idle_count = 0; }; // Used for reading as-traced schedules. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 21f8fba7a6c..e69d23a05ee 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -2245,7 +2245,7 @@ test_synthetic_with_syscalls_precise() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/4); + /*verbosity=*/3); sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) != @@ -2349,7 +2349,7 @@ test_synthetic_with_syscalls_latencies() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/4); + /*verbosity=*/3); // We use a mock time for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.time_units_per_us = 1.; From e3fbf73b787a7d80c0777a62b3bda3e5687e26c0 Mon Sep 17 00:00:00 2001 From: Connal de Souza Date: Mon, 7 Oct 2024 17:27:25 -0400 Subject: [PATCH 03/15] Move conditions for prefetching into the prefetch() function (#7012) This change allows custom prefetchers to train on cache hits in addition to misses. A test is added to verify the hit/miss information is now available in the prefetcher. 
--- .../drcachesim/simulator/caching_device.cpp | 4 +- clients/drcachesim/simulator/prefetcher.cpp | 13 ++++--- clients/drcachesim/simulator/prefetcher.h | 5 ++- .../tests/drcachesim_unit_tests.cpp | 39 ++++++++++++------- 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/clients/drcachesim/simulator/caching_device.cpp b/clients/drcachesim/simulator/caching_device.cpp index 0126e0b8203..08482bd0f0b 100644 --- a/clients/drcachesim/simulator/caching_device.cpp +++ b/clients/drcachesim/simulator/caching_device.cpp @@ -240,8 +240,8 @@ caching_device_t::request(const memref_t &memref_in) // Issue a hardware prefetch, if any, before we remember the last tag, // so we remember this line and not the prefetched line. - if (missed && !type_is_prefetch(memref.data.type) && prefetcher_ != nullptr) - prefetcher_->prefetch(this, memref); + if (prefetcher_ != nullptr && !type_is_prefetch(memref_in.data.type)) + prefetcher_->prefetch(this, memref, missed); if (tag + 1 <= final_tag) { addr_t next_addr = (tag + 1) << block_size_bits_; diff --git a/clients/drcachesim/simulator/prefetcher.cpp b/clients/drcachesim/simulator/prefetcher.cpp index f9ffa9950ca..04f9b08378a 100644 --- a/clients/drcachesim/simulator/prefetcher.cpp +++ b/clients/drcachesim/simulator/prefetcher.cpp @@ -48,13 +48,16 @@ prefetcher_t::prefetcher_t(int block_size) } void -prefetcher_t::prefetch(caching_device_t *cache, const memref_t &memref_in) +prefetcher_t::prefetch(caching_device_t *cache, const memref_t &memref_in, + const bool missed) { // We implement a simple next-line prefetcher. 
- memref_t memref = memref_in; - memref.data.addr += block_size_; - memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; - cache->request(memref); + if (missed) { + memref_t memref = memref_in; + memref.data.addr += block_size_; + memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; + cache->request(memref); + } } } // namespace drmemtrace } // namespace dynamorio diff --git a/clients/drcachesim/simulator/prefetcher.h b/clients/drcachesim/simulator/prefetcher.h index aa960f7b7d5..5729ae2c38a 100644 --- a/clients/drcachesim/simulator/prefetcher.h +++ b/clients/drcachesim/simulator/prefetcher.h @@ -50,8 +50,11 @@ class prefetcher_t { virtual ~prefetcher_t() { } + // prefetch() will be called for all demand accesses, even those that + // hit in the cache. The missed parameter indicates whether the + // memref.data.addr is already in the cache or not. virtual void - prefetch(caching_device_t *cache, const memref_t &memref); + prefetch(caching_device_t *cache, const memref_t &memref, bool missed); protected: int block_size_; diff --git a/clients/drcachesim/tests/drcachesim_unit_tests.cpp b/clients/drcachesim/tests/drcachesim_unit_tests.cpp index eae89fa0218..a0b8878f1cd 100644 --- a/clients/drcachesim/tests/drcachesim_unit_tests.cpp +++ b/clients/drcachesim/tests/drcachesim_unit_tests.cpp @@ -313,13 +313,6 @@ unit_test_nextline_prefetcher() class next2line_prefetcher_factory_t : public prefetcher_factory_t { public: - prefetcher_t * - create_prefetcher(int block_size) override - { - return new next2line_prefetcher_t(block_size); - } - -private: class next2line_prefetcher_t : public prefetcher_t { public: next2line_prefetcher_t(int block_size) @@ -327,17 +320,33 @@ class next2line_prefetcher_factory_t : public prefetcher_factory_t { { } void - prefetch(caching_device_t *cache, const memref_t &memref_in) + prefetch(caching_device_t *cache, const memref_t &memref_in, const bool missed) { // We implement a simple 2 next-line prefetcher. 
- memref_t memref = memref_in; - memref.data.addr += block_size_; - memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; - cache->request(memref); - memref.data.addr += block_size_; - cache->request(memref); + // We also track whether inputs are hits or misses for testing. + if (missed) { + misses_++; + memref_t memref = memref_in; + memref.data.addr += block_size_; + memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; + cache->request(memref); + memref.data.addr += block_size_; + cache->request(memref); + } else { + hits_++; + } } + int hits_ = 0; + int misses_ = 0; }; + + next2line_prefetcher_t * + create_prefetcher(int block_size) override + { + prefetcher_ = new next2line_prefetcher_t(block_size); + return prefetcher_; + } + next2line_prefetcher_t *prefetcher_; }; void @@ -361,6 +370,8 @@ unit_test_custom_prefetcher() assert(nextline_cache_sim.get_cache_metric(metric_name_t::MISSES, 1, 0) == EXPECTED_MISSES_NEXT2LINE_PREFETCHER); + assert(next2line_prefetcher_factory.prefetcher_->hits_ == 4); + assert(next2line_prefetcher_factory.prefetcher_->misses_ == 2); } void unit_test_child_hits() From a15656a0c47eb9fd941ac572d44116e24ff51665 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 9 Oct 2024 10:30:18 -0400 Subject: [PATCH 04/15] i#7025: Restore msgbox_mask package defaults (#7030) Replaces the AUTOMATED_TESTING set in package builds by PR #5769 with a new DISABLE_ZLIB CMake option to accomplish the goal of disabling the zlib found on these VMs while avoiding turning off msgbox_mask. The disabling by default of msgbox_mask in packages has caused many users to fail to obtain error information and has led to confusion with silent errors. Tested: built a cronbuild based on this branch and downloaded it to the Windows VM used for #7024. I ran this: ``` C:\Users\bruening\DynamoRIO-Windows-10.93.20004>bin64\drrun -- msg ``` And confirmed a messagebox popped up: thus showing that `-msgbox_mask` is *not* set to 0 anymore. 
Issue: #7025 --- CMakeLists.txt | 5 +++-- make/package.cmake | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba527facc36..c0c696c415d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # ********************************************************** -# Copyright (c) 2010-2023 Google, Inc. All rights reserved. +# Copyright (c) 2010-2024 Google, Inc. All rights reserved. # Copyright (c) 2009-2010 VMware, Inc. All rights reserved. # Copyright (c) 2018 Arm Limited All rights reserved. # ********************************************************** @@ -1840,7 +1840,8 @@ endfunction () # TODO i#5767: Install an explicit zlib package on our Windows GA CI images # (this find_package finds a strawberry perl zlib which causes 32-bit build # and 64-bit private loader issues). -if (WIN32 AND AUTOMATED_TESTING) +option(DISABLE_ZLIB "Disable looking for and using zlib" OFF) +if (WIN32 AND NOT DISABLE_ZLIB) set(ZLIB_FOUND OFF) else () find_package(ZLIB) diff --git a/make/package.cmake b/make/package.cmake index 67ae451db0e..d6585d8c343 100644 --- a/make/package.cmake +++ b/make/package.cmake @@ -1,5 +1,5 @@ # ********************************************************** -# Copyright (c) 2011-2022 Google, Inc. All rights reserved. +# Copyright (c) 2011-2024 Google, Inc. All rights reserved. # Copyright (c) 2009-2010 VMware, Inc. All rights reserved. # ********************************************************** @@ -164,10 +164,15 @@ set(base_cache " BUILD_NUMBER:STRING=${arg_build} UNIQUE_BUILD_NUMBER:STRING=${arg_ubuild} BUILD_PACKAGE:BOOL=ON - AUTOMATED_TESTING:BOOL=ON ${arg_cacheappend} ") +if (WIN32) + # TODO i#5767: Install a working zlib package on our Windows GA CI images. 
+ set(base_cache "${base_cache} + DISABLE_ZLIB:BOOL=ON") +endif() + # version is optional if (arg_version) set(base_cache "${base_cache} From 32adb70e0984fff5d69fb23d6b8424997a4897ec Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Wed, 9 Oct 2024 11:21:28 -0400 Subject: [PATCH 05/15] i#5505 PT tracing: Skip interrupted thread-final syscall trace (#7027) Skips dumping the PT trace for the interrupted thread-final syscall. Syscall PT traces from interrupted thread-final syscalls like futex, epoll_wait have been observed to not decode successfully due to pte_bad_context in libipt. They also do not represent the correct app behavior as they were interrupted by the detach signal. We skip dumping them to the raw trace. Verified on a system that supports Intel-PT that relevant tests continue to pass: ``` The following tests passed: code_api|tool.drcacheoff.kernel.simple_SUDO code_api|tool.drcacheoff.kernel.opcode-mix_SUDO code_api|tool.drcacheoff.kernel.syscall-mix_SUDO code_api|tool.drcacheoff.kernel.invariant-checker_SUDO ``` Also verified on a large app that user+syscall PT traces gathered with this change do not have the decode issue previously seen on the PT trace of the interrupted last syscall. 
Issue: #5505 --- .../drcachesim/tracer/syscall_pt_trace.cpp | 12 +++-- clients/drcachesim/tracer/syscall_pt_trace.h | 6 ++- clients/drcachesim/tracer/tracer.cpp | 53 ++++++++++++------- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/clients/drcachesim/tracer/syscall_pt_trace.cpp b/clients/drcachesim/tracer/syscall_pt_trace.cpp index 24ebceb1f6a..5700e4abd91 100644 --- a/clients/drcachesim/tracer/syscall_pt_trace.cpp +++ b/clients/drcachesim/tracer/syscall_pt_trace.cpp @@ -157,7 +157,7 @@ syscall_pt_trace_t::start_syscall_pt_trace(DR_PARAM_IN int sysnum) } bool -syscall_pt_trace_t::stop_syscall_pt_trace() +syscall_pt_trace_t::stop_syscall_pt_trace(bool dump_to_trace) { ASSERT(is_initialized_, "syscall_pt_trace_t is not initialized"); ASSERT(drcontext_ != nullptr, "drcontext_ is nullptr"); @@ -171,11 +171,13 @@ syscall_pt_trace_t::stop_syscall_pt_trace() return false; } - if (!trace_data_dump(pttracer_output_buffer_)) { - return false; - } + if (dump_to_trace) { + if (!trace_data_dump(pttracer_output_buffer_)) { + return false; + } - traced_syscall_idx_++; + ++traced_syscall_idx_; + } cur_recording_sysnum_ = -1; /* Reset the pttracer handle for next syscall. diff --git a/clients/drcachesim/tracer/syscall_pt_trace.h b/clients/drcachesim/tracer/syscall_pt_trace.h index d9af890d227..108474b8864 100644 --- a/clients/drcachesim/tracer/syscall_pt_trace.h +++ b/clients/drcachesim/tracer/syscall_pt_trace.h @@ -122,9 +122,11 @@ class syscall_pt_trace_t { bool start_syscall_pt_trace(DR_PARAM_IN int sysnum); - /* Stop the PT tracing for current syscall and dump the output data to one file. */ + /* Stop the PT tracing for current syscall and dump the output data to the trace + * if dump_to_trace is set. + */ bool - stop_syscall_pt_trace(); + stop_syscall_pt_trace(bool dump_to_trace); /* Get the sysnum of current recording syscall. 
*/ int diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 6d8f6cab7d1..033965b9c8a 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -1499,27 +1499,28 @@ exit_record_syscall() #ifdef BUILD_PT_TRACER static bool -stop_cur_syscall_pt_trace(void *drcontext, per_thread_t *data) +stop_cur_syscall_pt_trace(void *drcontext, per_thread_t *data, bool dump_to_trace) { int cur_recording_sysnum = data->syscall_pt_trace.get_cur_recording_sysnum(); ASSERT(cur_recording_sysnum != INVALID_SYSNUM, "Routine expected to be called only when PT tracing is active."); - /* Write a marker to the userspace raw trace that denotes where raw2trace - * should decode and insert the PT trace for the system call being - * recorded currently. Some drmemtrace derivations may interleave the PT - * trace raw data with the drmemtrace user-space raw trace data (instead of - * outputting the PT trace data to separate files like we do here). In such - * cases, we want to ensure that the TRACE_MARKER_TYPE_SYSCALL_IDX does not - * get output before the actual PT trace data, so we output the marker when - * we stop and write the PT trace (instead of when we start the PT trace). - * Note that the order below does not matter because the actual buffer - * flush happens later. - */ - trace_marker_type_t marker_type = TRACE_MARKER_TYPE_SYSCALL_IDX; - uintptr_t marker_val = data->syscall_pt_trace.get_traced_syscall_idx(); - BUF_PTR(data->seg_base) += - instru->append_marker(BUF_PTR(data->seg_base), marker_type, marker_val); - if (!data->syscall_pt_trace.stop_syscall_pt_trace()) { + if (dump_to_trace) { + // Write a marker to the userspace raw trace that denotes where raw2trace + // should decode and insert the PT trace for the system call being + // recorded currently. 
Some drmemtrace derivations may interleave the PT + // trace raw data with the drmemtrace user-space raw trace data (instead of + // outputting the PT trace data to separate files like we do here). In such + // cases, we want to ensure that the TRACE_MARKER_TYPE_SYSCALL_IDX does not + // get output before the actual PT trace data, so we output the marker when + // we stop and write the PT trace (instead of when we start the PT trace). + // Note that the order below does not matter because the actual buffer + // flush happens later. + trace_marker_type_t marker_type = TRACE_MARKER_TYPE_SYSCALL_IDX; + uintptr_t marker_val = data->syscall_pt_trace.get_traced_syscall_idx(); + BUF_PTR(data->seg_base) += + instru->append_marker(BUF_PTR(data->seg_base), marker_type, marker_val); + } + if (!data->syscall_pt_trace.stop_syscall_pt_trace(dump_to_trace)) { NOTIFY(0, "ERROR: Failed to stop PT tracing for syscall %d of thread " "T%d.\n", @@ -1611,7 +1612,13 @@ event_pre_syscall(void *drcontext, int sysnum) "Last syscall tracing wasn't stopped when we reached the next one"); // In the release build, in case we somehow did not stop the PT tracing, we // try to stop it and continue. - if (!stop_cur_syscall_pt_trace(drcontext, data)) + // XXX: Something didn't go as expected as the last syscall PT trace was + // not stopped yet. We may need to find other control points where PT + // tracing needs to be stopped. E.g., PT tracing for syscalls interrupted by + // signals may need to be stopped in main_signal_handler. Though it has + // not been observed yet, the traces dumped below may have issues during + // decoding. + if (!stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/true)) return false; } @@ -1688,7 +1695,7 @@ event_post_syscall(void *drcontext, int sysnum) ASSERT(syscall_pt_trace_t::is_syscall_pt_trace_enabled(cur_recording_sysnum), "Did not expect syscall tracing to be enabled for this syscall"); // Ignore return value and try to continue in release build. 
- stop_cur_syscall_pt_trace(drcontext, data); + stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/true); } else { // No syscall trace is being recorded. This may be because syscall tracing // is not enabled for sysnum, or that we were not in tracing mode at the @@ -1865,7 +1872,13 @@ event_thread_exit(void *drcontext) "found active at detach.\n", cur_recording_sysnum, dr_get_thread_id(drcontext)); // Ignore return value and try to continue in release build. - stop_cur_syscall_pt_trace(drcontext, data); + // We skip dumping the trace because the syscall was likely interrupted + // by the detach signal and does not represent the real app behavior. + // XXX: Can we somehow figure out how much of the PT trace we can keep? + // Such PT syscall traces at the thread's end have been seen to not + // decode successfully in libipt, particularly for syscalls like futex, + // and epoll_wait. + stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/false); } } #endif From a94727353e3a867db3cec5fc22fe45c9ca979b02 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 9 Oct 2024 18:09:14 -0400 Subject: [PATCH 06/15] i#7024 hook oom: Allow vm_base=0 to mean pick from OS (#7029) Adds the use of "-vm_base 0" to mean to let the OS pick our vmcode base, with no random offset applied. This was useful during diagnosing the OOM from the CrowdStrike hook. 
Issue: #7024 --- core/heap.c | 39 +++++++++++++++++++++++---------------- core/optionsx.h | 9 ++++++--- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/core/heap.c b/core/heap.c index 162264f5483..168084a8c72 100644 --- a/core/heap.c +++ b/core/heap.c @@ -786,6 +786,8 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro { ptr_uint_t preferred = 0; #ifdef X64 + LOG(GLOBAL, LOG_HEAP, 1, "%s: vm heap allowed range " PFX "-" PFX "\n", __FUNCTION__, + heap_allowable_region_start, heap_allowable_region_end); /* -heap_in_lower_4GB takes top priority and has already set heap_allowable_region_*. * Next comes -vm_base_near_app. It will fail for -vm_size=2G, which we document. */ @@ -838,20 +840,24 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro /* Next we try the -vm_base value plus a random offset. */ if (vmh->start_addr == NULL) { - /* Out of 32 bits = 12 bits are page offset, windows wastes 4 more - * since its allocation base is 64KB, and if we want to stay - * safely in say 0x20000000-0x2fffffff we're left with only 12 - * bits of randomness - which may be too little. On the other - * hand changing any of the lower 16 bits will make our bugs - * non-deterministic. */ - /* Make sure we don't waste the lower bits from our random number */ - preferred = (DYNAMO_OPTION(vm_base) + - get_random_offset(DYNAMO_OPTION(vm_max_offset) / - DYNAMO_OPTION(vmm_block_size)) * - DYNAMO_OPTION(vmm_block_size)); - preferred = ALIGN_FORWARD(preferred, OS_ALLOC_GRANULARITY); - /* overflow check: w/ vm_base shouldn't happen so debug-only check */ - ASSERT(!POINTER_OVERFLOW_ON_ADD(preferred, *size)); + if (DYNAMO_OPTION(vm_base) == 0) { + /* Let the OS pick where. 
*/ + } else { + /* Out of 32 bits = 12 bits are page offset, windows wastes 4 more + * since its allocation base is 64KB, and if we want to stay + * safely in say 0x20000000-0x2fffffff we're left with only 12 + * bits of randomness - which may be too little. On the other + * hand changing any of the lower 16 bits will make our bugs + * non-deterministic. */ + /* Make sure we don't waste the lower bits from our random number */ + preferred = (DYNAMO_OPTION(vm_base) + + get_random_offset(DYNAMO_OPTION(vm_max_offset) / + DYNAMO_OPTION(vmm_block_size)) * + DYNAMO_OPTION(vmm_block_size)); + preferred = ALIGN_FORWARD(preferred, OS_ALLOC_GRANULARITY); + /* overflow check: w/ vm_base shouldn't happen so debug-only check */ + ASSERT(!POINTER_OVERFLOW_ON_ADD(preferred, *size)); + } /* let's assume a single chunk is sufficient to reserve */ #ifdef X64 if ((byte *)preferred < heap_allowable_region_start || @@ -866,8 +872,9 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro os_heap_reserve((void *)preferred, *size, error_code, true /*+x*/); vmh->start_addr = vmh->alloc_start; LOG(GLOBAL, LOG_HEAP, 1, - "vmm_heap_unit_init preferred=" PFX " got start_addr=" PFX "\n", - preferred, vmh->start_addr); + "vmm_heap_unit_init preferred=" PFX " size=" PIFX " => start_addr=" PFX + " (code=0x%08x)\n", + preferred, *size, vmh->start_addr, *error_code); #ifdef X64 } #endif diff --git a/core/optionsx.h b/core/optionsx.h index a35a3934502..e95d757cbdf 100644 --- a/core/optionsx.h +++ b/core/optionsx.h @@ -1,5 +1,5 @@ /* ******************************************************************************* - * Copyright (c) 2010-2022 Google, Inc. All rights reserved. + * Copyright (c) 2010-2024 Google, Inc. All rights reserved. * Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved. * Copyright (c) 2003-2010 VMware, Inc. All rights reserved. 
* *******************************************************************************/ @@ -1627,8 +1627,11 @@ OPTION_DEFAULT(uint_size, vmheap_size_wow64, 128 * 1024 * 1024, "capacity of virtual memory region reserved for unreachable heap " "on WoW64 processes") #endif -/* We hardcode an address in the mmap_text region here, but verify via - * in vmk_init(). +/* We hardcode a default vmcode base address, to which we add a random offset + * up to vm_max_offset. + * If -vm_base is set to 0, and -vm_base_near_app is off, we let the OS pick + * the base and do not apply any offset. + * For the default: * For Linux we start higher to avoid limiting the brk (i#766), but with our * new default -vm_size of 0x20000000 we want to stay below our various * preferred addresses of 0x7xxx0000 so we keep the base plus offset plus From 3c1b564d79af1fb57242600f986578d8485ee3af Mon Sep 17 00:00:00 2001 From: Jack Gallagher Date: Thu, 10 Oct 2024 15:49:42 +0100 Subject: [PATCH 07/15] i#5036 AArch64 scatter/gather: Fix GCC 13.2 compiler error (#7035) Fixes the compiler error seen on GCC 13.2: ext/drx/scatter_gather_aarch64.c:1238:37: error: array subscript 2 is above array bounds of 'sg_slot_t[2]' {aka 'struct _sg_slot_t[2]'} [-Werror=array-bounds=] 1238 | DR_ASSERT(slot_state->pred_slots[slot].kind == SLOT_KIND_UNUSED); In practice the code will never make an out of bounds access because the for loop above will always terminate by breaking so `slot` is always `< NUM_PRED_SLOTS`. 
Issue: #5036 --- ext/drx/scatter_gather_aarch64.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index 2a3ad603403..195196f1bc7 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -1235,7 +1235,8 @@ reserve_pred_register(instr_t *sg_instr, spill_slot_state_t *slot_state) break; } } - DR_ASSERT(slot_state->pred_slots[slot].kind == SLOT_KIND_UNUSED); + DR_ASSERT(slot < NUM_PRED_SLOTS && + slot_state->pred_slots[slot].kind == SLOT_KIND_UNUSED); /* Some instructions require the predicate to be in the range p0 - p7. This includes * LASTB which we use to extract elements from the vector register. @@ -1261,7 +1262,8 @@ reserve_vector_register(instr_t *sg_instr, spill_slot_state_t *slot_state) break; } } - DR_ASSERT(slot_state->vector_slots[slot].kind == SLOT_KIND_UNUSED); + DR_ASSERT(slot < NUM_VECTOR_SLOTS && + slot_state->vector_slots[slot].kind == SLOT_KIND_UNUSED); reg_id_t min_reg = DR_REG_Z0; /* Skip over any registers that have already been allocated. 
*/ @@ -1314,8 +1316,9 @@ unreserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where, break; } } - DR_ASSERT(slot_state->pred_slots[slot].kind == SLOT_KIND_SPILL); - DR_ASSERT(slot_state->pred_slots[slot].reg == scratch_pred); + DR_ASSERT(slot < NUM_PRED_SLOTS && + slot_state->pred_slots[slot].kind == SLOT_KIND_SPILL); + DR_ASSERT(slot < NUM_PRED_SLOTS && slot_state->pred_slots[slot].reg == scratch_pred); unreserve_sve_register(drcontext, bb, where, scratch_gpr0, scratch_pred, offsetof(per_thread_t, scratch_pred_spill_slots), @@ -1337,7 +1340,8 @@ unreserve_vector_register(void *drcontext, instrlist_t *bb, instr_t *where, break; } } - DR_ASSERT(slot_state->vector_slots[slot].reg == scratch_vec); + DR_ASSERT(slot < NUM_VECTOR_SLOTS && + slot_state->vector_slots[slot].reg == scratch_vec); unreserve_sve_register(drcontext, bb, where, scratch_gpr0, scratch_vec, offsetof(per_thread_t, scratch_vector_spill_slots_aligned), From 8e694d83f1b88e5c33c0c055b58412d70444a74e Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Thu, 10 Oct 2024 15:05:10 -0400 Subject: [PATCH 08/15] i#5505 PT tracing: Add burst PT test with interrupted futex (#7032) Adds a test where one of the threads is waiting on a futex when detach occurs. PT traces for such futex syscalls have been observed to fail in libipt decode. We also do not want such PT traces because they do not represent real app behavior, as the syscall was interrupted by DR's detach signal. #7027 added logic to skip them from the written trace. This PR adds a regression test. Unfortunately this test still does not reproduce the original libipt decode issue that was seen on a large app. Most errors seen were on a modified kernel and only a few on a regular futex. But it is still useful to add this test that ensures that the thread-final interrupted syscall is skipped. 
This test also uncovers a possible transparency violation seen in the behavior of an interrupted-and-restarted futex call, where the blocked thread doesn't remember that it was supposed to wait on a different futex specified by a later FUTEX_CMP_REQUEUE call than the one specified by it in the original futex syscall. Since the new test requires Intel-PT, verified that it passes by running it manually locally: ``` The following tests passed: code_api|tool.drcacheoff.burst_syscall_pt_SUDO The following tests passed: code_api|tool.drcacheoff.kernel.simple_SUDO code_api|tool.drcacheoff.kernel.opcode-mix_SUDO code_api|tool.drcacheoff.kernel.syscall-mix_SUDO code_api|tool.drcacheoff.kernel.invariant-checker_SUDO ``` Issue: #5505 Issue: #7034 --- clients/drcachesim/CMakeLists.txt | 11 + clients/drcachesim/tests/burst_syscall_pt.cpp | 347 ++++++++++++++++++ .../offline-burst_syscall_pt_SUDO.templatex | 9 + clients/drcachesim/tracer/tracer.cpp | 2 +- suite/tests/CMakeLists.txt | 13 +- 5 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 clients/drcachesim/tests/burst_syscall_pt.cpp create mode 100644 clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex diff --git a/clients/drcachesim/CMakeLists.txt b/clients/drcachesim/CMakeLists.txt index 81e54281312..6d6ad8e344b 100644 --- a/clients/drcachesim/CMakeLists.txt +++ b/clients/drcachesim/CMakeLists.txt @@ -1086,6 +1086,17 @@ if (BUILD_TESTS) use_DynamoRIO_drmemtrace_tracer(tool.drcacheoff.burst_syscall_inject) endif () + if (LINUX AND BUILD_PT_POST_PROCESSOR AND BUILD_PT_TRACER) + add_executable(tool.drcacheoff.burst_syscall_pt_SUDO tests/burst_syscall_pt.cpp) + configure_DynamoRIO_static(tool.drcacheoff.burst_syscall_pt_SUDO) + use_DynamoRIO_static_client(tool.drcacheoff.burst_syscall_pt_SUDO drmemtrace_static) + target_link_libraries(tool.drcacheoff.burst_syscall_pt_SUDO drmemtrace_raw2trace + drmemtrace_analyzer test_helpers drmemtrace_basic_counts) + 
add_win32_flags(tool.drcacheoff.burst_syscall_pt_SUDO) + use_DynamoRIO_drmemtrace_tracer(tool.drcacheoff.burst_syscall_pt_SUDO) + link_with_pthread(tool.drcacheoff.burst_syscall_pt_SUDO) + endif () + if (UNIX) if (X86 AND NOT APPLE) # This test is x86-specific. # uses ptrace and looks for linux-specific syscalls diff --git a/clients/drcachesim/tests/burst_syscall_pt.cpp b/clients/drcachesim/tests/burst_syscall_pt.cpp new file mode 100644 index 00000000000..f58dccaf9e1 --- /dev/null +++ b/clients/drcachesim/tests/burst_syscall_pt.cpp @@ -0,0 +1,347 @@ +/* ********************************************************** + * Copyright (c) 2016-2024 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. 
OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +// This application links in drmemtrace_static and acquires a trace during +// a "burst" of execution that includes some system call traces collected +// using Intel-PT. + +// This is set globally in CMake for other tests so easier to undef here. +#undef DR_REG_ENUM_COMPATIBILITY + +#include "analyzer.h" +#include "tools/basic_counts.h" +#include "dr_api.h" +#include "drmemtrace/drmemtrace.h" +#include "drmemtrace/raw2trace.h" +#include "mock_reader.h" +#include "raw2trace_directory.h" +#include "scheduler.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dynamorio { +namespace drmemtrace { + +#define FATAL_ERROR(msg, ...) \ + do { \ + fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); \ + fflush(stderr); \ + exit(1); \ + } while (0) + +/* The futex the child waits at initially. */ +static uint32_t futex_var = 0xf00d; +/* The futex the child is transferred to. */ +static uint32_t futex_var_other = 0x8bad; + +static void * +child_futex_wait(void *) +{ + long res = syscall(SYS_futex, &futex_var, FUTEX_WAIT, /*#val=*/0xf00d, + /*timeout=*/nullptr, /*uaddr2=*/nullptr, /*val3=*/0); + assert(res == 0); + std::cerr << "Child released from futex\n"; + return NULL; +} + +static void +parent_futex_wake() +{ + /* The child would be waiting at the other futex by now. + * i#7034: Note that the child thread undergoes detach while it is waiting + * on futex_var_other. 
There is a bug at this point due to a possible + * transparency violation in DR. When the child thread restarts futex after + * being interrupted by DR's detach signal, it is found to resume + * waiting at the original futex_var instead of futex_var_other. + * If we modify this app to do detach after parent_futex_wake returns, then + * the child is found to be waiting at futex_var_other as expected. + */ + uint32_t *child_waiting_at_futex = &futex_var; + long res = syscall(SYS_futex, child_waiting_at_futex, FUTEX_WAKE, /*#wakeup=*/1, + /*timeout=*/nullptr, /*uaddr2=*/nullptr, /*val3=*/0); + assert(res == 1); +} + +static void +parent_futex_reque() +{ + long res; + do { + /* Repeat until the child is surely waiting at the futex. We'll know this + * when the following call returns a 1, which means the child was + * transferred to futex_var_other. This is to ensure that the child thread + * is inside the futex syscall when DR detaches. + */ + res = syscall(SYS_futex, &futex_var, FUTEX_CMP_REQUEUE, /*#wakeup_max=*/0, + /*#requeue_max=*/1, /*uaddr2=*/&futex_var_other, /*val3=*/0xf00d); + assert(res == 0 || res == 1); + } while (res == 0); +} + +static int +do_some_syscalls() +{ + getpid(); + gettid(); + return 1; +} + +static std::string +postprocess(void *dr_context) +{ + std::cerr << "Post-processing the trace\n"; + // Get path to write the final trace to. 
+ const char *raw_dir; + drmemtrace_status_t mem_res = drmemtrace_get_output_path(&raw_dir); + assert(mem_res == DRMEMTRACE_SUCCESS); + std::string outdir = std::string(raw_dir) + DIRSEP + "post_processed"; + + const char *kcore_path; + drmemtrace_status_t kcore_res = drmemtrace_get_kcore_path(&kcore_path); + assert(kcore_res == DRMEMTRACE_SUCCESS); + + raw2trace_directory_t dir; + if (!dr_create_dir(outdir.c_str())) + FATAL_ERROR("Failed to create output dir."); + std::string dir_err = dir.initialize(raw_dir, outdir, DEFAULT_TRACE_COMPRESSION_TYPE, + /*syscall_template_file=*/""); + assert(dir_err.empty()); + raw2trace_t raw2trace(dir.modfile_bytes_, dir.in_files_, dir.out_files_, + dir.out_archives_, dir.encoding_file_, + dir.serial_schedule_file_, dir.cpu_schedule_file_, dr_context, + /*verbosity=*/0, /*worker_count=*/-1, + /*alt_module_dir=*/"", + /*chunk_instr_count=*/10 * 1000 * 1000, dir.in_kfiles_map_, + dir.kcoredir_, /*kallsyms_path=*/"", + /*syscall_template_file=*/nullptr, + // We want to fail if any error is encountered. + /*pt2ir_best_effort=*/false); + std::string error = raw2trace.do_conversion(); + if (!error.empty()) + FATAL_ERROR("raw2trace failed: %s\n", error.c_str()); + uint64 decoded_syscall_count = + raw2trace.get_statistic(RAW2TRACE_STAT_SYSCALL_TRACES_CONVERTED); + // We should see atleast the getpid, gettid, and futex syscalls made by the parent. + if (decoded_syscall_count <= 2) { + std::cerr << "Incorrect decoded syscall count (found: " << decoded_syscall_count + << " vs expected > 2)\n"; + } + return outdir; +} + +// Trace analysis tool that allows us to verify properties of the generated PT trace. 
+class pt_analysis_tool_t : public analysis_tool_t { +public: + pt_analysis_tool_t() + { + } + bool + process_memref(const memref_t &memref) override + { + FATAL_ERROR("Expected to use sharded mode"); + return true; + } + bool + parallel_shard_supported() override + { + return true; + } + void * + parallel_shard_init(int shard_index, void *worker_data) override + { + auto per_shard = new per_shard_t; + return reinterpret_cast(per_shard); + } + bool + parallel_shard_exit(void *shard_data) override + { + std::lock_guard guard(shard_exit_mutex_); + per_shard_t *shard = reinterpret_cast(shard_data); + if (shard->syscall_count == 0) + return true; + // In case the child has just the one futex syscall which was skipped + // from the trace. + if (shard->syscall_count > 1 && !shard->any_syscall_had_trace) { + std::cerr << "No syscall had a trace\n"; + } + if (shard->prev_was_futex_marker && !shard->prev_syscall_had_trace) { + found_final_futex_without_trace_ = true; + } + if (shard->kernel_instr_count > 0) { + found_some_kernel_instrs_ = true; + } + return true; + } + bool + parallel_shard_memref(void *shard_data, const memref_t &memref) override + { + per_shard_t *shard = reinterpret_cast(shard_data); + if (memref.marker.type == TRACE_TYPE_MARKER) { + switch (memref.marker.marker_type) { + case TRACE_MARKER_TYPE_SYSCALL_TRACE_START: + shard->in_syscall_trace = true; + break; + case TRACE_MARKER_TYPE_SYSCALL_TRACE_END: + shard->in_syscall_trace = false; + shard->prev_syscall_had_trace = true; + shard->any_syscall_had_trace = true; + break; + case TRACE_MARKER_TYPE_SYSCALL: + ++shard->syscall_count; + shard->prev_syscall_had_trace = false; + if (memref.marker.marker_value == SYS_futex) { + shard->prev_was_futex_marker = true; + } + break; + } + } + if (!type_is_instr(memref.data.type)) + return true; + if (shard->in_syscall_trace) { + ++shard->kernel_instr_count; + return true; + } + shard->prev_was_futex_marker = false; + shard->prev_syscall_had_trace = false; + return 
true; + } + bool + print_results() override + { + if (!found_final_futex_without_trace_) { + std::cerr + << "Did not find any thread trace with final futex without PT trace\n"; + } else { + std::cerr << "Found matching signature in a thread\n"; + } + if (!found_some_kernel_instrs_) { + std::cerr << "Did not find any kernel instrs\n"; + } + return true; + } + +private: + // Data tracked per shard. + struct per_shard_t { + bool prev_was_futex_marker = false; + bool prev_syscall_had_trace = false; + bool any_syscall_had_trace = false; + int syscall_count = 0; + bool in_syscall_trace = false; + int kernel_instr_count = 0; + }; + + bool found_final_futex_without_trace_ = false; + bool found_some_kernel_instrs_ = false; + std::mutex shard_exit_mutex_; +}; + +static bool +run_pt_analysis(const std::string &trace_dir) +{ + auto pt_analysis_tool = std::unique_ptr(new pt_analysis_tool_t()); + std::vector tools; + tools.push_back(pt_analysis_tool.get()); + analyzer_t analyzer(trace_dir, &tools[0], static_cast(tools.size())); + if (!analyzer) { + FATAL_ERROR("failed to initialize analyzer: %s", + analyzer.get_error_string().c_str()); + } + if (!analyzer.run()) { + FATAL_ERROR("failed to run analyzer: %s", analyzer.get_error_string().c_str()); + } + if (!analyzer.print_stats()) { + FATAL_ERROR("failed to print stats: %s", analyzer.get_error_string().c_str()); + } + return true; +} + +static void +gather_trace() +{ + if (setenv("DYNAMORIO_OPTIONS", + "-stderr_mask 0xc -client_lib ';;-offline -enable_kernel_tracing", + 1 /*override*/) != 0) + std::cerr << "failed to set env var!\n"; + dr_app_setup(); + assert(!dr_app_running_under_dynamorio()); + dr_app_start(); + + pthread_t child_thread; + int res = pthread_create(&child_thread, NULL, child_futex_wait, NULL); + assert(res == 0); + + /* Ensure that the child is waiting at a futex. */ + parent_futex_reque(); + + do_some_syscalls(); + + dr_app_stop_and_cleanup(); + + /* Wake up the child finally. 
*/ + parent_futex_wake(); + + pthread_join(child_thread, NULL); + + return; +} + +static int +test_pt_trace(void *dr_context) +{ + std::string trace_dir = postprocess(dr_context); + if (!run_pt_analysis(trace_dir)) + return 1; + return 0; +} + +int +test_main(int argc, const char *argv[]) +{ + gather_trace(); + void *dr_context = dr_standalone_init(); + if (test_pt_trace(dr_context)) { + return 1; + } + dr_standalone_exit(); + return 0; +} + +} // namespace drmemtrace +} // namespace dynamorio diff --git a/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex b/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex new file mode 100644 index 00000000000..840000756c6 --- /dev/null +++ b/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex @@ -0,0 +1,9 @@ +ERROR: PT tracing for the last syscall .* of thread .* was found active at detach. +Child released from futex +Post-processing the trace +Found matching signature in a thread +Syscall mix tool results: + syscall count : syscall_num +.* + syscall trace count : syscall_num +.* diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 033965b9c8a..88db853deb3 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -1868,7 +1868,7 @@ event_thread_exit(void *drcontext) int cur_recording_sysnum = data->syscall_pt_trace.get_cur_recording_sysnum(); if (cur_recording_sysnum != INVALID_SYSNUM) { NOTIFY(0, - "ERROR: PT tracing for the last syscall %d of thread T%d was " + "ERROR: PT tracing for the last syscall %d of thread T%d was " "found active at detach.\n", cur_recording_sysnum, dr_get_thread_id(drcontext)); // Ignore return value and try to continue in release build. 
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 577fe842833..590c4ccf0f3 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -1403,7 +1403,11 @@ function(torun test key source native standalone_dr dr_ops exe_ops added_out pas endif () if (DEFINED ${key}_sudo) - set(rundr "sudo;${rundr}") + if (DEFINED ${key}_nodr) + set(exepath "sudo;${exepath}") + else () + set(rundr "sudo;${rundr}") + endif () endif () if (is_runall) @@ -5040,6 +5044,13 @@ if (BUILD_CLIENTS) # kernel tracing code is not being intentionally tested. torunonly_drcachesim(kernel-skip-kcore_SUDO ${ci_shared_app} "-offline -enable_kernel_tracing -skip_kcore_dump" "") + + if (LINUX) + set(tool.drcacheoff.burst_syscall_pt_SUDO_nodr ON) + set(tool.drcacheoff.burst_syscall_pt_SUDO_sudo ON) + torunonly_drcacheoff(burst_syscall_pt_SUDO tool.drcacheoff.burst_syscall_pt_SUDO + "" "@-tool@syscall_mix" "") + endif () endif (BUILD_PT_TRACER AND BUILD_PT_POST_PROCESSOR) endif (proc_supports_pt) From 5f4a4c7463387ca1b48d2fbf5d8228416c2b5991 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 10 Oct 2024 18:48:49 -0400 Subject: [PATCH 09/15] i#7023: Switch drmemtrace replay of idles to use record count (#7036) The drmemtrace replay of idles was using wall-clock time to reproduce the duration. A new idle replay type is added here (and made the default) which uses the idle record count instead, leaving support for the old format under the old type. Idle during replay now properly returns STATUS_IDLE, rather than STATUS_WAIT. Tested the new idle type on the existing record-replay tests inside scheduler_unit_tests and in core_sharded_test. Support for the old format is considered best-effort as its use of wall-clock time makes it not able to reproduce precise timing in any case. 
Tested manually by generating an old-style record and ensuring the new code handled it: $ clients/bin64/drmemtrace_launcher -indir ../src/clients/drcachesim/tests/drmemtrace.threadsig.x64.tracedir -core_sharded -cores 4 -replay_file ../build_x64_rel_tests/legacy_record.zip Fixes #7023 --- clients/drcachesim/scheduler/scheduler.cpp | 116 ++++++++++++--------- clients/drcachesim/scheduler/scheduler.h | 8 +- 2 files changed, 75 insertions(+), 49 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 072f92c622f..051cc8aead1 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -118,7 +118,8 @@ replay_file_checker_t::check(archive_istream_t *infile) scheduler_t::schedule_record_t record; bool prev_was_idle = false; while (infile->read(reinterpret_cast(&record), sizeof(record))) { - if (record.type == scheduler_t::schedule_record_t::IDLE) { + if (record.type == scheduler_t::schedule_record_t::IDLE || + record.type == scheduler_t::schedule_record_t::IDLE_BY_COUNT) { if (prev_was_idle) return "Error: consecutive idle records"; prev_was_idle = true; @@ -1305,10 +1306,17 @@ scheduler_tmpl_t::read_recorded_schedule() VPRINT(this, 1, "output %d empty: returning eof up front\n", i); set_cur_input(i, INVALID_INPUT_ORDINAL); outputs_[i].at_eof = true; - } else if (outputs_[i].record[0].type == schedule_record_t::IDLE) { + } else if (outputs_[i].record[0].type == schedule_record_t::IDLE || + outputs_[i].record[0].type == schedule_record_t::IDLE_BY_COUNT) { set_cur_input(i, INVALID_INPUT_ORDINAL); outputs_[i].waiting = true; - outputs_[i].wait_start_time = 0; // Updated on first next_record(). + if (outputs_[i].record[0].type == schedule_record_t::IDLE) { + // Convert a legacy idle duration from microseconds to record counts. 
+ outputs_[i].record[0].value.idle_duration = + static_cast(options_.time_units_per_us * + outputs_[i].record[0].value.idle_duration); + } + outputs_[i].idle_start_count = -1; // Updated on first next_record(). VPRINT(this, 3, "output %d starting out idle\n", i); } else { assert(outputs_[i].record[0].type == schedule_record_t::DEFAULT); @@ -2531,17 +2539,25 @@ scheduler_tmpl_t::record_schedule_segment( input_ordinal_t input, uint64_t start_instruction, uint64_t stop_instruction) { assert(type == schedule_record_t::VERSION || type == schedule_record_t::FOOTER || - type == schedule_record_t::IDLE || inputs_[input].lock->owned_by_cur_thread()); + // ::IDLE is a legacy type we should not see in new recordings. + type == schedule_record_t::IDLE_BY_COUNT || + inputs_[input].lock->owned_by_cur_thread()); // We always use the current wall-clock time, as the time stored in the prior // next_record() call can be out of order across outputs and lead to deadlocks. uint64_t timestamp = get_time_micros(); - if (type == schedule_record_t::IDLE && - outputs_[output].record.back().type == schedule_record_t::IDLE) { + if (type == schedule_record_t::IDLE_BY_COUNT && + outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) { // Merge. We don't need intermediate timestamps when idle, and consecutive // idle records quickly balloon the file. return sched_type_t::STATUS_OK; } - VPRINT(this, 4, + if (type == schedule_record_t::IDLE_BY_COUNT) { + // Start prior to this idle. + outputs_[output].idle_start_count = outputs_[output].idle_count - 1; + // That is what we'll record in the value union shared w/ start_instruction. 
+ assert(start_instruction == outputs_[output].idle_count - 1); + } + VPRINT(this, 3, "recording out=%d type=%d input=%d start=%" PRIu64 " stop=%" PRIu64 " time=%" PRIu64 "\n", output, type, input, start_instruction, stop_instruction, timestamp); @@ -2560,24 +2576,25 @@ scheduler_tmpl_t::close_schedule_segment(output_ordinal_ assert(!outputs_[output].record.empty()); assert(outputs_[output].record.back().type == schedule_record_t::VERSION || outputs_[output].record.back().type == schedule_record_t::FOOTER || - outputs_[output].record.back().type == schedule_record_t::IDLE || + // ::IDLE is for legacy recordings, not new ones. + outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT || input.lock->owned_by_cur_thread()); if (outputs_[output].record.back().type == schedule_record_t::SKIP) { // Skips already have a final stop value. return sched_type_t::STATUS_OK; } - if (outputs_[output].record.back().type == schedule_record_t::IDLE) { - // Just like in record_schedule_segment() we use wall-clock time for recording - // replay timestamps. 
- uint64_t end = get_time_micros(); - assert(end >= outputs_[output].record.back().timestamp); + if (outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) { + uint64_t end_idle_count = outputs_[output].idle_count; + assert(outputs_[output].idle_start_count >= 0); + assert(end_idle_count >= + static_cast(outputs_[output].idle_start_count)); outputs_[output].record.back().value.idle_duration = - end - outputs_[output].record.back().timestamp; + end_idle_count - outputs_[output].idle_start_count; VPRINT(this, 3, - "close_schedule_segment: idle duration %" PRIu64 " = %" PRIu64 + "close_schedule_segment[%d]: idle duration %" PRIu64 " = %" PRIu64 " - %" PRIu64 "\n", - outputs_[output].record.back().value.idle_duration, end, - outputs_[output].record.back().timestamp); + output, outputs_[output].record.back().value.idle_duration, end_idle_count, + outputs_[output].idle_start_count); return sched_type_t::STATUS_OK; } uint64_t instr_ord = get_instr_ordinal(input); @@ -2596,9 +2613,9 @@ scheduler_tmpl_t::close_schedule_segment(output_ordinal_ ++instr_ord; } VPRINT(this, 3, - "close_schedule_segment: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64 + "close_schedule_segment[%d]: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64 "\n", - input.index, outputs_[output].record.back().type, + output, input.index, outputs_[output].record.back().type, outputs_[output].record.back().value.start_instruction, instr_ord); // Check for empty default entries, except the starter 0,0 ones. 
assert(outputs_[output].record.back().type != schedule_record_t::DEFAULT || @@ -2927,7 +2944,9 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, add_to_ready_queue(output, &inputs_[prev_input]); } } else if (options_.schedule_record_ostream != nullptr && - outputs_[output].record.back().type == schedule_record_t::IDLE) { + (outputs_[output].record.back().type == schedule_record_t::IDLE || + outputs_[output].record.back().type == + schedule_record_t::IDLE_BY_COUNT)) { input_info_t unused; sched_type_t::stream_status_t status = close_schedule_segment(output, unused); if (status != sched_type_t::STATUS_OK) @@ -3038,19 +3057,27 @@ scheduler_tmpl_t::pick_next_input_as_previously( } return eof_or_idle(output, outputs_[output].cur_input); } - const schedule_record_t &segment = outputs_[output].record[record_index + 1]; - if (segment.type == schedule_record_t::IDLE) { + schedule_record_t &segment = outputs_[output].record[record_index + 1]; + if (segment.type == schedule_record_t::IDLE || + segment.type == schedule_record_t::IDLE_BY_COUNT) { outputs_[output].waiting = true; - outputs_[output].wait_start_time = get_output_time(output); + if (segment.type == schedule_record_t::IDLE) { + // Convert a legacy idle duration from microseconds to record counts. 
+ segment.value.idle_duration = static_cast( + options_.time_units_per_us * segment.value.idle_duration); + } + outputs_[output].idle_start_count = outputs_[output].idle_count; outputs_[output].record_index->fetch_add(1, std::memory_order_release); ++outputs_[output].idle_count; + VPRINT(this, 5, "%s[%d]: next replay segment idle for %" PRIu64 "\n", + __FUNCTION__, output, segment.value.idle_duration); return sched_type_t::STATUS_IDLE; } index = segment.key.input; VPRINT(this, 5, - "pick_next_input_as_previously[%d]: next replay segment in=%d (@%" PRId64 - ") type=%d start=%" PRId64 " end=%" PRId64 "\n", - output, index, get_instr_ordinal(inputs_[index]), segment.type, + "%s[%d]: next replay segment in=%d (@%" PRId64 ") type=%d start=%" PRId64 + " end=%" PRId64 "\n", + __FUNCTION__, output, index, get_instr_ordinal(inputs_[index]), segment.type, segment.value.start_instruction, segment.stop_instruction); { std::lock_guard lock(*inputs_[index].lock); @@ -3404,7 +3431,9 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (options_.schedule_record_ostream != nullptr) { sched_type_t::stream_status_t record_status = record_schedule_segment( - output, schedule_record_t::IDLE, 0, 0, 0); + output, schedule_record_t::IDLE_BY_COUNT, 0, + // Start prior to this idle. + outputs_[output].idle_count - 1, 0); if (record_status != sched_type_t::STATUS_OK) return record_status; } @@ -3734,18 +3763,9 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // do return an error on a time smaller than an input's current start time when we // check for quantum end. if (cur_time == 0) { - if (options_.mapping == MAP_AS_PREVIOUSLY) { - // XXX i#7023: We should instead store the simulator's time (whether - // passed in or our instr-based formula below) in the records and do away - // with wall-clock time for idle measurement. Either way, we should make - // it clear in the docs whether the user/simulator has to pass in the - // time on replay. 
- cur_time = get_time_micros(); - } else { - // We add 1 to avoid an invalid value of 0. - cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + - outputs_[output].idle_count; - } + // We add 1 to avoid an invalid value of 0. + cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + + outputs_[output].idle_count; } // Invalid values for cur_time are checked below. outputs_[output].cur_time->store(cur_time, std::memory_order_release); @@ -3755,22 +3775,24 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } if (outputs_[output].waiting) { if (options_.mapping == MAP_AS_PREVIOUSLY && - outputs_[output].wait_start_time > 0) { + outputs_[output].idle_start_count >= 0) { uint64_t duration = outputs_[output] .record[outputs_[output].record_index->load( std::memory_order_acquire)] .value.idle_duration; - uint64_t now = get_output_time(output); - if (now - outputs_[output].wait_start_time < duration) { + uint64_t now = outputs_[output].idle_count; + if (now - outputs_[output].idle_start_count < duration) { VPRINT(this, 4, "next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n", - output, now - outputs_[output].wait_start_time, duration); - // XXX i#7023: This should always be STATUS_IDLE, right? 
- return sched_type_t::STATUS_WAIT; + output, now - outputs_[output].idle_start_count, duration); + ++outputs_[output].idle_count; + return sched_type_t::STATUS_IDLE; } else - outputs_[output].wait_start_time = 0; + outputs_[output].idle_start_count = -1; } - VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output); + VPRINT(this, 5, + "next_record[%d]: need new input (cur=waiting; idles=%" PRIu64 ")\n", + output, outputs_[output].idle_count); sched_type_t::stream_status_t res = pick_next_input(output, 0); if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_SKIPPED) return res; diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 2ea8b334154..9d315e50058 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1459,6 +1459,9 @@ template class scheduler_tmpl_t { // Indicates that the output is idle. The value.idle_duration field holds // a duration in microseconds. IDLE, + // Indicates that the output is idle. The value.idle_duration field holds + // a duration as a count of idle records. + IDLE_BY_COUNT, }; static constexpr int VERSION_CURRENT = 0; schedule_record_t() = default; @@ -1493,6 +1496,7 @@ template class scheduler_tmpl_t { { } // For record_type_t::IDLE, the duration in microseconds of the idling. + // For record_type_t::IDLE_BY_COUNT, the duration as a count of idle records. uint64_t idle_duration; // Input stream ordinal of starting point, for non-IDLE types. uint64_t start_instruction = 0; @@ -1692,8 +1696,8 @@ template class scheduler_tmpl_t { int64_t as_traced_cpuid = -1; // Used for MAP_AS_PREVIOUSLY with live_replay_output_count_. bool at_eof = false; - // Used for replaying wait periods. - uint64_t wait_start_time = 0; + // Used for recording and replaying idle periods. + int64_t idle_start_count = -1; // Exported statistics. Currently all integers and cast to double on export. 
std::vector stats = std::vector(memtrace_stream_t::SCHED_STAT_TYPE_COUNT); From aeebb11a3c15b42ef9971f4455be6144d342e252 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 11 Oct 2024 12:19:13 -0400 Subject: [PATCH 10/15] i#7028: Modify core-sharded cpu ids and timestamps (#7037) In core-sharded dynamically-scheduled mode, the as-traced cpu ids and timestamps are interleaved in a way that makes them confusing and not useful. We change the scheduler here to replace each cpu id value with the shard id, and replace the timestamp with a synthetic value based on the instruction and idle counts. Adds a test. Fixes #7028 --- clients/drcachesim/scheduler/scheduler.cpp | 75 ++++++++++++ clients/drcachesim/scheduler/scheduler.h | 11 ++ .../drcachesim/tests/scheduler_unit_tests.cpp | 107 +++++++++++++++++- 3 files changed, 188 insertions(+), 5 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 051cc8aead1..9ffbd78504d 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -278,6 +278,17 @@ scheduler_tmpl_t::record_type_is_non_marker_header(memref_t return false; } +template <> +bool +scheduler_tmpl_t::record_type_set_marker_value(memref_t &record, + uintptr_t value) +{ + if (record.marker.type != TRACE_TYPE_MARKER) + return false; + record.marker.marker_value = value; + return true; +} + template <> bool scheduler_tmpl_t::record_type_is_timestamp(memref_t record, @@ -473,6 +484,17 @@ scheduler_tmpl_t::record_type_is_instr_boundary( !record_reader_t::record_is_pre_instr(&prev_record); } +template <> +bool +scheduler_tmpl_t::record_type_set_marker_value( + trace_entry_t &record, uintptr_t value) +{ + if (record.type != TRACE_TYPE_MARKER) + return false; + record.addr = value; + return true; +} + template <> bool scheduler_tmpl_t::record_type_is_timestamp( @@ -4138,6 +4160,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } VPRINT(this, 4, 
"next_record[%d]: from %d @%" PRId64 ": ", output, input->index, cur_time); + update_next_record(output, record); VDO(this, 4, print_record(record);); outputs_[output].last_record = record; @@ -4146,6 +4169,58 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, return sched_type_t::STATUS_OK; } +template +void +scheduler_tmpl_t::update_next_record(output_ordinal_t output, + RecordType &record) +{ + if (options_.mapping != MAP_TO_ANY_OUTPUT && options_.mapping != MAP_AS_PREVIOUSLY) + return; // Nothing to do. + if (options_.replay_as_traced_istream != nullptr) { + // Do not modify MAP_TO_RECORDED_OUTPUT (turned into MAP_AS_PREVIOUSLY). + return; + } + // For a dynamic schedule, the as-traced cpuids and timestamps no longer + // apply and are just confusing (causing problems like interval analysis + // failures), so we replace them. + trace_marker_type_t type; + uintptr_t value; + if (!record_type_is_marker(record, type, value)) + return; // Nothing to do. + if (type == TRACE_MARKER_TYPE_TIMESTAMP) { + if (outputs_[output].base_timestamp == 0) { + // Record the first input's first timestamp, as a base value. 
+#ifndef NDEBUG + bool ok = +#endif + record_type_is_timestamp(record, outputs_[output].base_timestamp); + assert(ok); + assert(outputs_[output].base_timestamp != 0); + VPRINT(this, 2, "output %d base timestamp = %zu\n", output, + outputs_[output].base_timestamp); + } + uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal(); + uint64_t idle_count = outputs_[output].idle_count; + uintptr_t new_time = static_cast( + outputs_[output].base_timestamp + (instr_ord + idle_count) / INSTRS_PER_US); + VPRINT(this, 4, + "New time in output %d: %zu from base %zu and instrs %" PRIu64 + " idles %" PRIu64 "\n", + output, new_time, outputs_[output].base_timestamp, instr_ord, idle_count); +#ifndef NDEBUG + bool ok = +#endif + record_type_set_marker_value(record, new_time); + assert(ok); + } else if (type == TRACE_MARKER_TYPE_CPU_ID) { +#ifndef NDEBUG + bool ok = +#endif + record_type_set_marker_value(record, get_shard_index(output)); + assert(ok); + } +} + template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::unread_last_record(output_ordinal_t output, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 9d315e50058..f72963bf63b 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1704,6 +1704,8 @@ template class scheduler_tmpl_t { // When no simulation time is passed to us, we use the idle count plus // instruction count to measure time. uint64_t idle_count = 0; + // The first timestamp (pre-update_next_record()) seen on the first input. + uintptr_t base_timestamp = 0; }; // Used for reading as-traced schedules. @@ -1754,6 +1756,9 @@ template class scheduler_tmpl_t { uint64_t input_count = 0; }; + // We assume a 2GHz clock and IPC=1. + static constexpr uint64_t INSTRS_PER_US = 2000; + // Called just once at initialization time to set the initial input-to-output // mappings and state. 
scheduler_status_t @@ -1950,6 +1955,9 @@ template class scheduler_tmpl_t { bool record_type_is_timestamp(RecordType record, uintptr_t &value); + bool + record_type_set_marker_value(RecordType &record, uintptr_t value); + bool record_type_is_invalid(RecordType record); @@ -1975,6 +1983,9 @@ template class scheduler_tmpl_t { void insert_switch_tid_pid(input_info_t &input); + void + update_next_record(output_ordinal_t output, RecordType &record); + // Used for diagnostics: prints record fields to stderr. void print_record(const RecordType &record); diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index e69d23a05ee..9b6ed19e188 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -3294,7 +3295,8 @@ test_replay_skip() case 1: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 1); + // The value should be replaced by the shard id == 0. + assert(memref.marker.marker_value == 0); break; case 2: assert(type_is_instr(memref.instr.type)); @@ -3308,12 +3310,14 @@ test_replay_skip() case 4: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP); - assert(memref.marker.marker_value == 40); + // The value should be replaced by a synthetic value: the initial (10) + // won't have advanced to the next microsecond. 
+ assert(memref.marker.marker_value == 10); break; case 5: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 4); + assert(memref.marker.marker_value == 0); break; case 6: assert(type_is_instr(memref.instr.type)); @@ -3322,12 +3326,12 @@ test_replay_skip() case 7: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP); - assert(memref.marker.marker_value == 50); + assert(memref.marker.marker_value == 10); break; case 8: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 5); + assert(memref.marker.marker_value == 0); break; case 9: assert(type_is_instr(memref.instr.type)); @@ -6112,6 +6116,98 @@ test_exit_early() } } +static void +test_marker_updates() +{ + std::cerr << "\n----------------\nTesting marker updates\n"; + static constexpr int NUM_INPUTS = 5; + static constexpr int NUM_OUTPUTS = 3; + // We need at least enough instrs to cover INSTRS_PER_US==2000. + static constexpr int NUM_INSTRS = 10000; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t TIMESTAMP_BASE = 12340000; + + std::vector inputs[NUM_INPUTS]; + + std::minstd_rand rand_gen; + rand_gen.seed(static_cast(reinterpret_cast(&inputs[0]))); + + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + // Add a randomly-increasing-value timestamp. + uint64_t cur_timestamp = TIMESTAMP_BASE; + cur_timestamp += rand_gen(); + inputs[i].push_back(make_timestamp(cur_timestamp)); + // Add a cpuid with a random value. 
+ inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, rand_gen())); + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Add a randomly-increasing-value timestamp. + cur_timestamp += rand_gen(); + inputs[i].push_back(make_timestamp(cur_timestamp)); + // Add a cpuid with a random value. + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, rand_gen())); + } + inputs[i].push_back(make_exit(tid)); + } + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), + TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector outputs(NUM_OUTPUTS, nullptr); + std::vector last_timestamp(NUM_OUTPUTS, 0); + std::vector eof(NUM_OUTPUTS, false); + for (int i = 0; i < NUM_OUTPUTS; i++) + outputs[i] = scheduler.get_stream(i); + int num_eof = 0; + while (num_eof < NUM_OUTPUTS) { + for (int i = 0; i < NUM_OUTPUTS; i++) { + if (eof[i]) + continue; + memref_t memref; + scheduler_t::stream_status_t status = outputs[i]->next_record(memref); + if (status == scheduler_t::STATUS_EOF) { + ++num_eof; + eof[i] = true; + continue; + } + if (status == scheduler_t::STATUS_IDLE) + continue; + assert(status == scheduler_t::STATUS_OK); + if (memref.marker.type != TRACE_TYPE_MARKER) + continue; + // Make sure the random values have some order now, satisfying invariants. 
+ if (memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP) { + assert(memref.marker.marker_value >= last_timestamp[i]); + last_timestamp[i] = memref.marker.marker_value; + } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID) { + assert(memref.marker.marker_value == + static_cast(outputs[i]->get_shard_index())); + } + } + } + // Ensure we didn't short-circuit or exit early. + uint64_t instrs_seen = 0; + for (int i = 0; i < NUM_OUTPUTS; i++) + instrs_seen += outputs[i]->get_instruction_ordinal(); + assert(instrs_seen == NUM_INPUTS * NUM_INSTRS); +} + int test_main(int argc, const char *argv[]) { @@ -6154,6 +6250,7 @@ test_main(int argc, const char *argv[]) test_record_scheduler(); test_rebalancing(); test_exit_early(); + test_marker_updates(); dr_standalone_exit(); return 0; From a6af5794cc3ea033867d2dc6bb127323435ec0bb Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 11 Oct 2024 16:06:41 -0400 Subject: [PATCH 11/15] i#6938 sched migrate: Enforce migration threshold at the start (#7038) Previously, a never-executed input could be moved to another output at any time, yet was still counted as a "migration". We change that here to consider a never-executed input to have executed at the initial simulation time seen on an output, so it will not be migrated until that threshold is met. An exception is the very first rebalance at init time for the initial allocation of inputs to outputs when inputs can be freely moved; this does not count as a migration. Adds a unit test. 
Issue: #6938 --- clients/drcachesim/scheduler/scheduler.cpp | 20 ++++- clients/drcachesim/scheduler/scheduler.h | 2 + .../drcachesim/tests/scheduler_unit_tests.cpp | 90 +++++++++++++++++++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 9ffbd78504d..752a140d8f3 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2780,6 +2780,12 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( found_candidate = true; else { assert(cur_time > 0 || res->last_run_time == 0); + if (res->last_run_time == 0) { + // For never-executed inputs we consider their last execution + // to be the very first simulation time, which we can't + // easily initialize until here. + res->last_run_time = outputs_[from_output].initial_cur_time; + } VPRINT(this, 5, "migration check %d to %d: cur=%" PRIu64 " last=%" PRIu64 " delta=%" PRId64 " vs thresh %" PRIu64 "\n", @@ -2787,15 +2793,20 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( cur_time - res->last_run_time, options_.migration_threshold_us); // Guard against time going backward (happens for wall-clock: i#6966). - if (options_.migration_threshold_us == 0 || res->last_run_time == 0 || + if (options_.migration_threshold_us == 0 || + // Allow free movement for the initial load balance at init time. + cur_time == 0 || (cur_time > res->last_run_time && cur_time - res->last_run_time >= static_cast(options_.migration_threshold_us * options_.time_units_per_us))) { VPRINT(this, 2, "migrating %d to %d\n", from_output, for_output); found_candidate = true; - ++outputs_[from_output] - .stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]; + // Do not count an initial rebalance as a migration. 
+ if (cur_time > 0) { + ++outputs_[from_output] + .stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]; + } } } if (found_candidate) @@ -3789,6 +3800,9 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + outputs_[output].idle_count; } + if (outputs_[output].initial_cur_time == 0) { + outputs_[output].initial_cur_time = cur_time; + } // Invalid values for cur_time are checked below. outputs_[output].cur_time->store(cur_time, std::memory_order_release); if (!outputs_[output].active->load(std::memory_order_acquire)) { diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index f72963bf63b..7a11b52e05e 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1692,6 +1692,8 @@ template class scheduler_tmpl_t { // This is accessed by other outputs for stealing and rebalancing. // Indirected so we can store it in our vector. std::unique_ptr> cur_time; + // The first simulation time passed to this output. + uint64_t initial_cur_time = 0; // Used for MAP_TO_RECORDED_OUTPUT get_output_cpuid(). int64_t as_traced_cpuid = -1; // Used for MAP_AS_PREVIOUSLY with live_replay_output_count_. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 9b6ed19e188..5c121c947cb 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -6008,6 +6008,95 @@ test_rebalancing() } } +static void +test_initial_migrate() +{ + std::cerr << "\n----------------\nTesting initial migrations\n"; + // We want to ensure migration thresholds are applied to never-executed inputs. 
+ static constexpr int NUM_OUTPUTS = 2; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr memref_tid_t TID_A = TID_BASE + 0; + static constexpr memref_tid_t TID_B = TID_BASE + 1; + static constexpr memref_tid_t TID_C = TID_BASE + 2; + static constexpr uint64_t TIMESTAMP_START = 10; + + // We have 3 inputs and 2 outputs. We expect a round-robin initial assignment + // to put A and C on output #0 and B on #1. + // B will finish #1 and then try to steal C from A but should fail if initial + // migrations have to wait for the threshold as though the input just ran + // right before the trace started, which is how we treat them now. + std::vector refs_A = { + /* clang-format off */ + make_thread(TID_A), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(10), + make_instr(11), + make_instr(12), + make_instr(13), + make_instr(14), + make_instr(15), + make_exit(TID_A), + /* clang-format on */ + }; + std::vector refs_B = { + /* clang-format off */ + make_thread(TID_B), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(20), + make_exit(TID_B), + /* clang-format on */ + }; + std::vector refs_C = { + /* clang-format off */ + make_thread(TID_C), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START + 10), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(30), + make_instr(31), + make_instr(32), + make_exit(TID_C), + /* clang-format on */ + }; + + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + 
scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + // We should see zero migrations since output #1 failed to steal C from output #0. + static const char *const CORE0_SCHED_STRING = "...AAAAAA....CCC."; + static const char *const CORE1_SCHED_STRING = "...B.____________"; + for (int i = 0; i < NUM_OUTPUTS; i++) { + assert(scheduler.get_stream(i)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_MIGRATIONS) == 0); + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); +} + static void test_exit_early() { @@ -6249,6 +6338,7 @@ test_main(int argc, const char *argv[]) test_random_schedule(); test_record_scheduler(); test_rebalancing(); + test_initial_migrate(); test_exit_early(); test_marker_updates(); From 36e7538ff1f1a33bb466f8ffa8f6ca34dc4bc7b1 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 16 Oct 2024 00:47:53 -0400 Subject: [PATCH 12/15] i#6938 sched migrate: Make initial_cur_time atomic (#7043) Fixes a race in the drmemtrace scheduler by making output_info_t.initial_cur_time atomic. Tested on ThreadSanitizer on the internal test where this was first reported. 
Issue: #6938 --- clients/drcachesim/scheduler/scheduler.cpp | 7 ++++--- clients/drcachesim/scheduler/scheduler.h | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 752a140d8f3..7b3f885777c 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2784,7 +2784,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( // For never-executed inputs we consider their last execution // to be the very first simulation time, which we can't // easily initialize until here. - res->last_run_time = outputs_[from_output].initial_cur_time; + res->last_run_time = outputs_[from_output].initial_cur_time->load( + std::memory_order_acquire); } VPRINT(this, 5, "migration check %d to %d: cur=%" PRIu64 " last=%" PRIu64 @@ -3800,8 +3801,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + outputs_[output].idle_count; } - if (outputs_[output].initial_cur_time == 0) { - outputs_[output].initial_cur_time = cur_time; + if (outputs_[output].initial_cur_time->load(std::memory_order_acquire) == 0) { + outputs_[output].initial_cur_time->store(cur_time, std::memory_order_release); } // Invalid values for cur_time are checked below. 
outputs_[output].cur_time->store(cur_time, std::memory_order_release); diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 7a11b52e05e..e752839c1f8 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1645,6 +1645,9 @@ template class scheduler_tmpl_t { cur_time = std::unique_ptr>(new std::atomic()); cur_time->store(0, std::memory_order_relaxed); + initial_cur_time = + std::unique_ptr>(new std::atomic()); + initial_cur_time->store(0, std::memory_order_relaxed); record_index = std::unique_ptr>(new std::atomic()); record_index->store(0, std::memory_order_relaxed); } @@ -1693,7 +1696,9 @@ template class scheduler_tmpl_t { // Indirected so we can store it in our vector. std::unique_ptr> cur_time; // The first simulation time passed to this output. - uint64_t initial_cur_time = 0; + // This is accessed by other outputs for stealing and rebalancing. + // Indirected so we can store it in our vector. + std::unique_ptr> initial_cur_time; // Used for MAP_TO_RECORDED_OUTPUT get_output_cpuid(). int64_t as_traced_cpuid = -1; // Used for MAP_AS_PREVIOUSLY with live_replay_output_count_. From 5f74563a0f0e1a0606d89a9e8f57662f8bf67363 Mon Sep 17 00:00:00 2001 From: Enrico Deiana Date: Wed, 16 Oct 2024 12:00:00 -0500 Subject: [PATCH 13/15] i#7031: modify marker value filter (#7033) In some cases we want to modify the value of certain types of markers (TRACE_MARKER_TYPE_) in the trace. For example, we might want to avoid exposing the "as traced" CPU schedule in an offline trace because it might not be representative of the native execution of the traced program. Hence, we want to set the value of all TRACE_MARKER_TYPE_CPU_ID in the trace to unknown (i.e., (uintptr_t)-1 as documented). 
To do so we implement `modify_marker_value_filter_t`, a new filter that is used with `record_filter` as follows: ``` drrun -t drmemtrace -tool record_filter -filter_modify_marker_value 3,0xffffffffffffffff,18,2048 \ -indir path/to/input/trace -outdir path/to/output/trace ``` Here we set the value of TRACE_MARKER_TYPE_CPU_ID == 3 markers to (uintptr_t)-1 == 0xffffffffffffffff, which represents an unknown CPU, and TRACE_MARKER_TYPE_PAGE_SIZE == 18 markers to 0x800 == 2048 == 2k pages. In general, this filter takes a list of pairs and modifies the value of all listed TRACE_MARKER_TYPE_ markers in the trace with new_value. We add a unit test `test_modify_marker_value_filter()` and an end-to-end test `code_api|tool.record_filter_modify_marker_value` which invokes the `invariant_checker` and `view` tool on the filtered trace. Fixes #7031 --- api/docs/release.dox | 4 + clients/drcachesim/CMakeLists.txt | 1 + clients/drcachesim/analyzer_multi.cpp | 3 +- clients/drcachesim/common/options.cpp | 9 ++ clients/drcachesim/common/options.h | 1 + clients/drcachesim/common/trace_entry.h | 3 + ...ecord_filter_modify_marker_value.templatex | 20 ++++ .../tests/record_filter_unit_tests.cpp | 77 +++++++++++- .../tools/filter/modify_marker_value_filter.h | 111 ++++++++++++++++++ .../drcachesim/tools/filter/record_filter.cpp | 15 ++- .../tools/filter/record_filter_create.h | 5 +- .../tools/record_filter_launcher.cpp | 13 +- clients/drcachesim/tools/view.cpp | 9 +- suite/tests/CMakeLists.txt | 12 ++ 14 files changed, 275 insertions(+), 8 deletions(-) create mode 100644 clients/drcachesim/tests/record_filter_modify_marker_value.templatex create mode 100644 clients/drcachesim/tools/filter/modify_marker_value_filter.h diff --git a/api/docs/release.dox b/api/docs/release.dox index f43af9d3274..9e5a6d06697 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -265,6 +265,10 @@ Further non-compatibility-affecting changes include: - Added -trace_instr_intervals_file option to the
drmemtrace trace analysis tools framework. The file must be in CSV format containing a tracing interval per line where start and duration are expressed in number of instructions. + - Added modify_marker_value_filter_t to #dynamorio::drmemtrace::record_filter_t to modify + the value of TRACE_MARKER_TYPE_ markers. This filter takes a list of + and changes every listed marker in the trace to its + corresponding new_value. **************************************************
diff --git a/clients/drcachesim/CMakeLists.txt b/clients/drcachesim/CMakeLists.txt index 6d6ad8e344b..f8ea8feee24 100644 --- a/clients/drcachesim/CMakeLists.txt +++ b/clients/drcachesim/CMakeLists.txt @@ -201,6 +201,7 @@ add_exported_library(drmemtrace_record_filter STATIC tools/filter/type_filter.h tools/filter/encodings2regdeps_filter.h tools/filter/func_id_filter.h + tools/filter/modify_marker_value_filter.h tools/filter/null_filter.h) target_link_libraries(drmemtrace_record_filter drmemtrace_simulator drmemtrace_schedule_file) diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index c27f231831c..47bc5ff0c58 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -339,7 +339,8 @@ record_analyzer_multi_t::create_analysis_tool_from_options(const std::string &to op_filter_cache_size.get_value(), op_filter_trace_types.get_value(), op_filter_marker_types.get_value(), op_trim_before_timestamp.get_value(), op_trim_after_timestamp.get_value(), op_encodings2regdeps.get_value(), - op_filter_func_ids.get_value(), op_verbose.get_value()); + op_filter_func_ids.get_value(), op_modify_marker_value.get_value(), + op_verbose.get_value()); } ERRMSG("Usage error: unsupported record analyzer type \"%s\". Only " RECORD_FILTER " is supported.\n", diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 5166a8bcd63..8e98e08ed9d 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -1112,6 +1112,15 @@ droption_t "for the listed function IDs and removes those belonging to " "unlisted function IDs."); +droption_t op_modify_marker_value( + DROPTION_SCOPE_FRONTEND, "filter_modify_marker_value", "", + "Comma-separated pairs of integers representing .", + "This option is for -tool " RECORD_FILTER ". It modifies the value of all listed " + "TRACE_MARKER_TYPE_ markers in the trace with their corresponding new_value. 
" + "The list must have an even size. Example: -filter_modify_marker_value 3,24,18,2048 " + "sets all TRACE_MARKER_TYPE_CPU_ID == 3 in the trace to core 24 and " + "TRACE_MARKER_TYPE_PAGE_SIZE == 18 to 2k."); + droption_t op_trim_before_timestamp( DROPTION_SCOPE_ALL, "trim_before_timestamp", 0, 0, (std::numeric_limits::max)(), diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index ef24824b4d0..9e64dd920b3 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -230,6 +230,7 @@ extern dynamorio::droption::droption_t op_filter_trace_types; extern dynamorio::droption::droption_t op_filter_marker_types; extern dynamorio::droption::droption_t op_encodings2regdeps; extern dynamorio::droption::droption_t op_filter_func_ids; +extern dynamorio::droption::droption_t op_modify_marker_value; extern dynamorio::droption::droption_t op_trim_before_timestamp; extern dynamorio::droption::droption_t op_trim_after_timestamp; extern dynamorio::droption::droption_t op_abort_on_invariant_error; diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index 1b97224485a..903374ddd00 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -699,6 +699,9 @@ typedef enum { // Values below here are available for users to use for custom markers. } trace_marker_type_t; +// As documented in TRACE_MARKER_TYPE_CPU_ID, this value indicates an unknown CPU. +#define INVALID_CPU_MARKER_VALUE static_cast(-1) + /** Constants related to function or system call parameter tracing. */ enum class func_trace_t : uint64_t { // VS2019 won't infer 64-bit with "enum {". 
/** diff --git a/clients/drcachesim/tests/record_filter_modify_marker_value.templatex b/clients/drcachesim/tests/record_filter_modify_marker_value.templatex new file mode 100644 index 00000000000..e64c4881cc2 --- /dev/null +++ b/clients/drcachesim/tests/record_filter_modify_marker_value.templatex @@ -0,0 +1,20 @@ +Estimation of pi is 3.142425985001098 + +Trace invariant checks passed + +Output .* entries from .* entries. + +Output format: + +<--record#-> <--instr#->: <---tid---> + +------------------------------------------------------------ + + 1 0: +[0-9]+ + 2 0: +[0-9]+ + 3 0: +[0-9]+ + 4 0: +[0-9]+ + 5 0: +[0-9]+ + 6 0: +[0-9]+ + 7 0: +[0-9]+ +.* diff --git a/clients/drcachesim/tests/record_filter_unit_tests.cpp b/clients/drcachesim/tests/record_filter_unit_tests.cpp index fd6d6ccf0cf..7bdab2bc5ad 100644 --- a/clients/drcachesim/tests/record_filter_unit_tests.cpp +++ b/clients/drcachesim/tests/record_filter_unit_tests.cpp @@ -44,6 +44,7 @@ #include "tools/filter/type_filter.h" #include "tools/filter/encodings2regdeps_filter.h" #include "tools/filter/func_id_filter.h" +#include "tools/filter/modify_marker_value_filter.h" #include "trace_entry.h" #include "zipfile_ostream.h" @@ -600,6 +601,80 @@ test_func_id_filter() return true; } +static bool +test_modify_marker_value_filter() +{ + constexpr addr_t PC = 0x7f6fdd3ec360; + constexpr addr_t ENCODING = 0xe78948; + constexpr uint64_t NEW_PAGE_SIZE_MARKER_VALUE = 0x800; // 2k pages. + std::vector entries = { + // Trace shard header. 
+ { { TRACE_TYPE_HEADER, 0, { 0x1 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_VERSION, { 0x2 } }, true, { true } }, + { { TRACE_TYPE_MARKER, + TRACE_MARKER_TYPE_FILETYPE, + { OFFLINE_FILE_TYPE_ARCH_X86_64 | OFFLINE_FILE_TYPE_ENCODINGS | + OFFLINE_FILE_TYPE_SYSCALL_NUMBERS | OFFLINE_FILE_TYPE_BLOCKING_SYSCALLS } }, + true, + { true } }, + { { TRACE_TYPE_THREAD, 0, { 0x4 } }, true, { true } }, + { { TRACE_TYPE_PID, 0, { 0x5 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CACHE_LINE_SIZE, { 0x6 } }, + true, + { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_PAGE_SIZE, { 0x1000 } }, // 4k pages. + true, + { false } }, + // Overwrite the value of TRACE_MARKER_TYPE_PAGE_SIZE with 0x800 == 2048 == 2k + // page size. + { { TRACE_TYPE_MARKER, + TRACE_MARKER_TYPE_PAGE_SIZE, + { NEW_PAGE_SIZE_MARKER_VALUE } }, + false, + { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_TIMESTAMP, { 0x7 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CPU_ID, { 0x8 } }, true, { false } }, + // Overwrite the value of TRACE_MARKER_TYPE_CPU_ID with ((uintptr_t)-1). + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CPU_ID, { INVALID_CPU_MARKER_VALUE } }, + false, + { true } }, + // We need at least one instruction with encodings to make record_filter output + // the trace. + { { TRACE_TYPE_ENCODING, 3, { ENCODING } }, true, { true } }, + { { TRACE_TYPE_INSTR, 3, { PC } }, true, { true } }, + + { { TRACE_TYPE_FOOTER, 0, { 0x0 } }, true, { true } }, + }; + + // Construct modify_marker_value_filter_t. We change TRACE_MARKER_TYPE_CPU_ID values + // with INVALID_CPU_MARKER_VALUE == ((uintptr_t)-1) and TRACE_MARKER_TYPE_PAGE_SIZE + // with 2k. 
+ std::vector modify_marker_value_pairs_list = { TRACE_MARKER_TYPE_CPU_ID, + INVALID_CPU_MARKER_VALUE, + TRACE_MARKER_TYPE_PAGE_SIZE, + NEW_PAGE_SIZE_MARKER_VALUE }; + std::vector> filters; + auto modify_marker_value_filter = std::unique_ptr( + new dynamorio::drmemtrace::modify_marker_value_filter_t( + modify_marker_value_pairs_list)); + if (!modify_marker_value_filter->get_error_string().empty()) { + fprintf(stderr, "Couldn't construct a modify_marker_value_filter %s", + modify_marker_value_filter->get_error_string().c_str()); + return false; + } + filters.push_back(std::move(modify_marker_value_filter)); + + // Construct record_filter_t. + test_record_filter_t record_filter(std::move(filters), 0, /*write_archive=*/true); + + // Run the test. + if (!process_entries_and_check_result(&record_filter, entries, 0)) + return false; + + fprintf(stderr, "test_modify_marker_value_filter passed\n"); + return true; +} + static bool test_cache_and_type_filter() { @@ -1450,7 +1525,7 @@ test_main(int argc, const char *argv[]) dr_standalone_init(); if (!test_cache_and_type_filter() || !test_chunk_update() || !test_trim_filter() || !test_null_filter() || !test_wait_filter() || !test_encodings2regdeps_filter() || - !test_func_id_filter()) + !test_func_id_filter() || !test_modify_marker_value_filter()) return 1; fprintf(stderr, "All done!\n"); dr_standalone_exit(); diff --git a/clients/drcachesim/tools/filter/modify_marker_value_filter.h b/clients/drcachesim/tools/filter/modify_marker_value_filter.h new file mode 100644 index 00000000000..7833ba76826 --- /dev/null +++ b/clients/drcachesim/tools/filter/modify_marker_value_filter.h @@ -0,0 +1,111 @@ +/* ********************************************************** + * Copyright (c) 2024 Google, Inc. All rights reserved. 
+ * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#ifndef _MODIFY_MARKER_VALUE_FILTER_H_ +#define _MODIFY_MARKER_VALUE_FILTER_H_ 1 + +#include "record_filter.h" +#include "trace_entry.h" + +#include +#include + +namespace dynamorio { +namespace drmemtrace { + +/* This filter takes a list of pairs and modifies the value + * of all listed markers in the trace with the given new_value. 
+ */ +class modify_marker_value_filter_t : public record_filter_t::record_filter_func_t { +public: + modify_marker_value_filter_t(std::vector modify_marker_value_pairs_list) + { + size_t list_size = modify_marker_value_pairs_list.size(); + if (list_size == 0) { + error_string_ = "List of pairs is empty."; + } else if (list_size % 2 != 0) { + error_string_ = "List of pairs is missing " + "part of a pair as its size is not even"; + } else { + for (size_t i = 0; i < list_size; i += 2) { + trace_marker_type_t marker_type = + static_cast(modify_marker_value_pairs_list[i]); + uint64_t new_value = modify_marker_value_pairs_list[i + 1]; + // We ignore duplicate pairs and use the last pair in the list. + marker_to_value_map_[marker_type] = new_value; + } + } + } + + void * + parallel_shard_init(memtrace_stream_t *shard_stream, + bool partial_trace_filter) override + { + return nullptr; + } + + bool + parallel_shard_filter( + trace_entry_t &entry, void *shard_data, + record_filter_t::record_filter_info_t &record_filter_info) override + { + trace_type_t entry_type = static_cast(entry.type); + // Output any trace_entry_t that's not a marker. + if (entry_type != TRACE_TYPE_MARKER) + return true; + + // Check if the TRACE_MARKER_TYPE_ is in the list of markers for which we want to + // overwrite their value. If not, output the marker unchanged. + trace_marker_type_t marker_type = static_cast(entry.size); + const auto &it = marker_to_value_map_.find(marker_type); + if (it == marker_to_value_map_.end()) + return true; + + // Overwrite marker value. 
+ entry.addr = static_cast(it->second); + + return true; + } + + bool + parallel_shard_exit(void *shard_data) override + { + return true; + } + +private: + std::unordered_map marker_to_value_map_; +}; + +} // namespace drmemtrace +} // namespace dynamorio +#endif /* _MODIFY_MARKER_VALUE_FILTER_H_ */ diff --git a/clients/drcachesim/tools/filter/record_filter.cpp b/clients/drcachesim/tools/filter/record_filter.cpp index 9d038583b61..4a52c8b0ef8 100644 --- a/clients/drcachesim/tools/filter/record_filter.cpp +++ b/clients/drcachesim/tools/filter/record_filter.cpp @@ -62,6 +62,7 @@ #include "type_filter.h" #include "encodings2regdeps_filter.h" #include "func_id_filter.h" +#include "modify_marker_value_filter.h" #undef VPRINT #ifdef DEBUG @@ -95,7 +96,9 @@ parse_string(const std::string &s, char sep = ',') std::vector vec; do { pos = s.find(sep, at); - unsigned long long parsed_number = std::stoull(s.substr(at, pos)); + // base = 0 allows to handle both decimal and hex numbers. + unsigned long long parsed_number = + std::stoull(s.substr(at, pos), nullptr, /*base = */ 0); // XXX: parsed_number may be truncated if T is not large enough. 
// We could check that parsed_number is within the limits of T using // std::numeric_limits<>::min()/max(), but this returns 0 on T that are enums, @@ -119,7 +122,7 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp const std::string &remove_marker_types, uint64_t trim_before_timestamp, uint64_t trim_after_timestamp, bool encodings2regdeps, const std::string &keep_func_ids, - unsigned int verbose) + const std::string &modify_marker_value, unsigned int verbose) { std::vector< std::unique_ptr> @@ -160,6 +163,14 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp std::unique_ptr( new dynamorio::drmemtrace::func_id_filter_t(keep_func_ids_list))); } + if (!modify_marker_value.empty()) { + std::vector modify_marker_value_pairs_list = + parse_string(modify_marker_value); + filter_funcs.emplace_back( + std::unique_ptr( + new dynamorio::drmemtrace::modify_marker_value_filter_t( + modify_marker_value_pairs_list))); + } // TODO i#5675: Add other filters. diff --git a/clients/drcachesim/tools/filter/record_filter_create.h b/clients/drcachesim/tools/filter/record_filter_create.h index 0516d7bb713..d48513a4255 100644 --- a/clients/drcachesim/tools/filter/record_filter_create.h +++ b/clients/drcachesim/tools/filter/record_filter_create.h @@ -67,6 +67,9 @@ namespace drmemtrace { * @param[in] keep_func_ids A comma-separated list of integers representing the * function IDs related to #TRACE_MARKER_TYPE_FUNC_ID (and _ARG, _RETVAL, _RETADDR) * markers to preserve in the trace, while removing all other function markers. + * @param[in] modify_marker_value A list of comma-separated pairs of integers representing + * <marker_type, new_value> to modify the value of all listed TRACE_MARKER_TYPE_<marker_type> + * in the trace with their corresponding new_value. * @param[in] verbose Verbosity level for notifications.
*/ record_analysis_tool_t * @@ -75,7 +78,7 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp const std::string &remove_marker_types, uint64_t trim_before_timestamp, uint64_t trim_after_timestamp, bool encodings2regdeps, const std::string &keep_func_ids, - unsigned int verbose); + const std::string &modify_marker_value, unsigned int verbose); } // namespace drmemtrace } // namespace dynamorio diff --git a/clients/drcachesim/tools/record_filter_launcher.cpp b/clients/drcachesim/tools/record_filter_launcher.cpp index e1a8bb7a481..bcc9f2c1098 100644 --- a/clients/drcachesim/tools/record_filter_launcher.cpp +++ b/clients/drcachesim/tools/record_filter_launcher.cpp @@ -138,6 +138,16 @@ droption_t "TRACE_MARKER_TYPE_FUNC_[ID | ARG | RETVAL | RETADDR] " "markers for the listed function IDs and removed those " "belonging to unlisted function IDs."); + +droption_t op_modify_marker_value( + DROPTION_SCOPE_FRONTEND, "filter_modify_marker_value", "", + "Comma-separated pairs of integers representing .", + "This option is for -tool record_filter. It modifies the value of all listed " + "TRACE_MARKER_TYPE_ markers in the trace with their corresponding new_value. " + "The list must have an even size. 
Example: -filter_modify_marker_value 3,24,18,2048 " + "sets all TRACE_MARKER_TYPE_CPU_ID == 3 in the trace to core 24 and " + "TRACE_MARKER_TYPE_PAGE_SIZE == 18 to 2k."); + } // namespace int @@ -168,7 +178,8 @@ _tmain(int argc, const TCHAR *targv[]) op_cache_filter_size.get_value(), op_remove_trace_types.get_value(), op_remove_marker_types.get_value(), op_trim_before_timestamp.get_value(), op_trim_after_timestamp.get_value(), op_encodings2regdeps.get_value(), - op_filter_func_ids.get_value(), op_verbose.get_value())); + op_filter_func_ids.get_value(), op_modify_marker_value.get_value(), + op_verbose.get_value())); std::vector tools; tools.push_back(record_filter.get()); diff --git a/clients/drcachesim/tools/view.cpp b/clients/drcachesim/tools/view.cpp index 7817169c89d..702667db73c 100644 --- a/clients/drcachesim/tools/view.cpp +++ b/clients/drcachesim/tools/view.cpp @@ -335,8 +335,13 @@ view_t::parallel_shard_memref(void *shard_data, const memref_t &memref) // see a cpuid marker on a thread switch. To avoid that assumption // we would want to track the prior tid and print out a thread switch // message whenever it changes. - std::cerr << "\n"; + if (memref.marker.marker_value == INVALID_CPU_MARKER_VALUE) { + std::cerr << "\n"; + } else { + std::cerr << "\n"; + } break; case TRACE_MARKER_TYPE_KERNEL_EVENT: if (trace_version_ <= TRACE_ENTRY_VERSION_NO_KERNEL_PC) { diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 590c4ccf0f3..e7939ddaac1 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -4794,6 +4794,18 @@ if (BUILD_CLIENTS) # We run basic_counts on the filtered trace to check that there are no function # related markers left. "basic_counts") + + # Run modify_marker_value_filter to overwrite the values of + # TRACE_MARKER_TYPE_CPU_ID markers with (uintptr_t)-1, which represents an unknown + # CPU, and TRACE_MARKER_TYPE_PAGE_SIZE to 2k. 
+ set(testname "tool.record_filter_modify_marker_value") + torun_record_filter("${testname}" ${kernel_xfer_app} + "record_filter_modify_marker_value" + "${drcachesim_path}@-simulator_type@record_filter@-filter_modify_marker_value@3,0xffffffffffffffff,18,2048@-indir@${testname}.${kernel_xfer_app}.*.dir/trace@-outdir@${testname}.filtered.dir" + # We run view on the filtered trace to check that the value of the first + # TRACE_MARKER_TYPE_CPU_ID is (uintptr_t)-1 and the value of + # TRACE_MARKER_TYPE_PAGE_SIZE is 2k. + "view") endif () if (X86 AND X64 AND UNIX AND NOT APPLE) From 1afe5eb00f27e7747042d7f5ca4acbd18784c5fe Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 16 Oct 2024 19:32:16 -0400 Subject: [PATCH 14/15] i#7028 mod markers: Document scheduler-modified markers (#7041) Updates the documentation to explain that dynamic scheduling modifies timestamp and cpu id markers. Issue: #7028 --- clients/drcachesim/docs/drcachesim.dox.in | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in index 406e3293530..5cc2818c7bd 100644 --- a/clients/drcachesim/docs/drcachesim.dox.in +++ b/clients/drcachesim/docs/drcachesim.dox.in @@ -179,9 +179,9 @@ Some of the more important markers are: - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_KERNEL_XFER - This identifies a system call that changes control flow, such as a signal return. -- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP - The marker value provides a timestamp for this point of the trace (in units of microseconds since Jan 1, 1601 UTC). This value can be used to synchronize records from different threads as well as analyze latencies (however, tracing overhead inflates time unevenly, so time deltas should not be considered perfectly representative). It is used in the sequential analysis of a multi-threaded trace. 
+- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP - The marker value provides a timestamp for this point of the trace (in units of microseconds since Jan 1, 1601 UTC). This value can be used to synchronize records from different threads as well as analyze latencies (however, tracing overhead inflates time unevenly, so time deltas should not be considered perfectly representative). It is used in the sequential analysis of a multi-threaded trace. When dynamic scheduling is performed (see \ref sec_drcachesim_sched_dynamic), the original timestamp value is replaced with a synthetic value in order to maintain relative ordering (though the new values should not be relied upon to indicate accurate durations between events). -- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. It can be used to identify the "as traced" schedule, indicating which threads were on which cores at which times. However, this schedule is not representative and should not be treated as indicating how the application behaves without tracing. See \ref sec_drcachesim_as_traced for further information. +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. It can be used to identify the "as traced" schedule, indicating which threads were on which cores at which times. However, this schedule is not representative and should not be treated as indicating how the application behaves without tracing. See \ref sec_drcachesim_as_traced for further information. When dynamic scheduling is performed (see \ref sec_drcachesim_sched_dynamic), the original value of this marker is replaced with the new virtual core identifier. 
- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ID, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETADDR, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ARG, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETVAL - These markers are used to capture information about function calls. Which functions to capture must be explicitly selected at tracing time. Typical candiates are heap allocation and freeing functions. See \ref sec_drcachesim_funcs. @@ -1469,8 +1469,10 @@ While we suggest keeping traces stored as thread-sharded and using the dynamic s in each run, there is support for running the scheduler once and creating a new set of stored traces in core-sharded format: essentially switching to hardware-thread-oriented traces. This is done using the \ref sec_tool_record_filter tool in `-core_sharded` mode. -The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the -dynamic scheduler, and should be ignored in a newly created core-sharded trace. +The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic +scheduler to reflect the new schedule. The new timestamps maintain relative ordering +but should not be relied upon to indicate accurate durations between events. Traces also include markers indicating disruptions in user mode control flow such as signal handler entry and exit. @@ -1535,7 +1537,9 @@ through the -cpu_schedule_file option. \section sec_drcachesim_sched_dynamic Dynamic Scheduling Instead of using the as-traced schedule, we recommend re-scheduling the traced software -threads using our trace scheduler. Our scheduler essentially serves as an operating +threads using our trace scheduler in +#dynamorio::drmemtrace::scheduler_t::MAP_TO_ANY_OUTPUT mode. +Our scheduler essentially serves as an operating system scheduler for this purpose, though using simpler schemes. 
It models separate runqueues per core with support for binding inputs to certain cores, priorities, idle time from blocking system calls, migration thresholds, rebalancing runqueues, etc. It @@ -1564,8 +1568,10 @@ The downsides include: thus, timestamp ordering will be followed at context switch points for picking the next input, but timestamps will not preempt an input. -The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the -dynamic scheduler, and should be ignored in a dynamic rescheduling. +The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic +scheduler to reflect the new schedule. The new timestamps maintain relative ordering +but should not be relied upon to indicate accurate durations between events. ******************** \section sec_drcachesim_sched_time Simulated Time From e9a983ad3033e1e88a4756a4725dc1d60239847d Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 18 Oct 2024 12:36:17 -0400 Subject: [PATCH 15/15] i#6949: Enable core-sharded by default for simulators (#7042) Adds a new interface trace_analysis_tool::preferred_shard_type() to the drmemtrace framework to allow tools to request core-sharded operation. The cache simulator, TLB simulator, and schedule_stats tools override the new interface to request core-sharded mode. In the launcher, if all tools prefer core-sharded, and the user did not specify sharding (via -[no_]core_{sharded,serial} or -cpu_scheduling), then core-sharded (or core-serial) mode is enabled, with a -verbose 1+ message. 
``` $ bin64/drrun -stderr_mask 0 -t drcachesim -indir ../src/clients/drcachesim/tests/drmemtrace.threadsig.x64.tracedir/ -verbose 1 -tool schedule_stats:cache_simulator Enabling -core_serial as all tools prefer it <...> Schedule stats tool results: Total counts: 4 cores 8 threads: 1257600, 1257602, 1257599, 1257603, 1257598, 1257604, 1257596, 1257601 <...> Core #0 schedule: AEA_A_ <...> Cache simulation results: Core #0 (traced CPU(s): #0) L1I0 (size=32768, assoc=8, block=64, LRU) stats: Hits: 123,659 <...> ``` If sharding is not specified and tools do not agree on a preferred type (with the default being preferring thread-sharded, if a tool does not override the new method), an error is raised to avoid confusion: ``` $ bin64/drrun -t drcachesim -indir ../src/clients/drcachesim/tests/drmemtrace.threadsig.x64.tracedir -tool cache_simulator:basic_counts ERROR: failed to initialize analyzer: Selected tools differ in preferred sharding: please re-run with -[no_]core_sharded or -[no_]core_serial ``` Unfortunately, it is not easy to detect core-sharded-on-disk traces in the launcher, so the user must now pass `-no_core_sharded` when using such traces with core-sharded-preferring tools to avoid the trace being re-scheduled yet again. Documentation for this is added and it is turned into a fatal error since this re-scheduling there is almost certainly user error. Reduces the scheduler queue diagnostics by 5x as they seem too frequent in short runs of the simulator with the new defaults, which new users are going to see. Updates the documentation proper and the options documentation to describe the new defaults. Updates numerous drcachesim test output templates. Keeps a couple of simulator tests using thread-sharded by passing -no_core_serial. 
Fixes #6949 --- api/docs/release.dox | 6 ++ clients/drcachesim/analysis_tool.h | 13 +++++ clients/drcachesim/analyzer.cpp | 13 +++++ clients/drcachesim/analyzer_multi.cpp | 56 ++++++++++++++++++- clients/drcachesim/common/options.cpp | 26 +++++++-- clients/drcachesim/docs/drcachesim.dox.in | 19 +++++-- clients/drcachesim/scheduler/scheduler.cpp | 9 ++- .../drcachesim/simulator/cache_simulator.cpp | 3 +- clients/drcachesim/simulator/simulator.cpp | 7 ++- clients/drcachesim/simulator/simulator.h | 10 +++- .../drcachesim/simulator/tlb_simulator.cpp | 3 +- .../tests/offline-burst_client.templatex | 8 +-- .../tests/offline-burst_maps.templatex | 8 +-- .../tests/offline-burst_noreach.templatex | 8 +-- .../tests/offline-burst_replace.templatex | 8 +-- .../tests/offline-burst_static.templatex | 8 +-- ...line-filter-and-instr-only-trace.templatex | 8 +-- .../drcachesim/tests/offline-filter.templatex | 8 +-- .../drcachesim/tests/offline-fork.templatex | 8 +-- .../tests/offline-instr-only-trace.templatex | 8 +-- .../tests/offline-multiproc.templatex | 8 +-- .../drcachesim/tests/offline-simple.templatex | 8 +-- clients/drcachesim/tools/schedule_stats.h | 5 ++ suite/tests/CMakeLists.txt | 42 ++++++++------ 24 files changed, 216 insertions(+), 84 deletions(-) diff --git a/api/docs/release.dox b/api/docs/release.dox index 9e5a6d06697..a4a7d9f5049 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -269,6 +269,12 @@ Further non-compatibility-affecting changes include: the value of TRACE_MARKER_TYPE_ markers. This filter takes a list of and changes every listed marker in the trace to its corresponding new_value. + - Added trace_analysis_tool::preferred_shard_type() to the drmemtrace framework to + allow switching to core-sharded by default if all tools prefer that mode. 
+ - For the drmemtrace framework, if only core-sharded-preferring tools are enabled + (these include cache and TLB simulators and the schedule_stats tool), -core_sharded or + -core_serial is automatically turned on for offline analysis to enable more + representative simulated software thread scheduling onto virtual cores. **************************************************
diff --git a/clients/drcachesim/analysis_tool.h b/clients/drcachesim/analysis_tool.h index 24306cd7534..c5cae5f2b55 100644 --- a/clients/drcachesim/analysis_tool.h +++ b/clients/drcachesim/analysis_tool.h @@ -156,6 +156,19 @@ template class analysis_tool_tmpl_t { { return ""; } + /** + * Identifies the preferred shard type for this analysis. This only applies when + * the user does not specify a shard type for a run. In that case, if every tool + * being run prefers #SHARD_BY_CORE, the framework uses that mode. If tools + * disagree then an error is raised. This is ignored if the user specifies a + * shard type via one of -core_sharded, -core_serial, -no_core_sharded, + * -no_core_serial, or -cpu_scheduling. + */ + virtual shard_type_t + preferred_shard_type() + { + return SHARD_BY_THREAD; + } /** Returns whether the tool was created successfully. */ virtual bool operator!() diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index f2c2ddb5098..728648171fa 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -339,6 +339,19 @@ analyzer_tmpl_t::init_scheduler_common( uint64_t filetype = scheduler_.get_stream(i)->get_filetype(); VPRINT(this, 2, "Worker %d filetype %" PRIx64 "\n", i, filetype); if (TESTANY(OFFLINE_FILE_TYPE_CORE_SHARDED, filetype)) { + if (i == 0 && shard_type_ == SHARD_BY_CORE) { + // This is almost certainly user error. + // Better to exit than risk user confusion. + // XXX i#7045: Ideally this could be reported as an error by the + // scheduler, and also detected early in analyzer_multi to auto-fix + // (when no mode is specified: if the user specifies core-sharding + // there could be config differences and this should be an error), + // but neither is simple so today the user has to re-run. 
+ error_string_ = + "Re-scheduling a core-sharded-on-disk trace is generally a " + "mistake; re-run with -no_core_sharded.\n"; + return false; + } shard_type_ = SHARD_BY_CORE; } } diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index 47bc5ff0c58..521ae6bf488 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -462,6 +462,7 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() if (!error.empty()) { this->success_ = false; this->error_string_ = "raw2trace failed: " + error; + return; } } } @@ -473,8 +474,54 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() return; } + bool sharding_specified = op_core_sharded.specified() || op_core_serial.specified() || + // -cpu_scheduling implies thread-sharded. + op_cpu_scheduling.get_value(); + // TODO i#7040: Add core-sharded support for online tools. + bool offline = !op_indir.get_value().empty() || !op_infile.get_value().empty(); + if (offline && !sharding_specified) { + bool all_prefer_thread_sharded = true; + bool all_prefer_core_sharded = true; + for (int i = 0; i < this->num_tools_; ++i) { + if (this->tools_[i]->preferred_shard_type() == SHARD_BY_THREAD) { + all_prefer_core_sharded = false; + } else if (this->tools_[i]->preferred_shard_type() == SHARD_BY_CORE) { + all_prefer_thread_sharded = false; + } + if (this->parallel_ && !this->tools_[i]->parallel_shard_supported()) { + this->parallel_ = false; + } + } + if (all_prefer_core_sharded) { + // XXX i#6949: Ideally we could detect a core-sharded-on-disk input + // here and avoid this but that's not simple so currently we have a + // fatal error from the analyzer and the user must re-run with + // -no_core_sharded for such inputs. 
+ if (this->parallel_) { + if (op_verbose.get_value() > 0) + fprintf(stderr, "Enabling -core_sharded as all tools prefer it\n"); + op_core_sharded.set_value(true); + } else { + if (op_verbose.get_value() > 0) + fprintf(stderr, "Enabling -core_serial as all tools prefer it\n"); + op_core_serial.set_value(true); + } + } else if (!all_prefer_thread_sharded) { + this->success_ = false; + this->error_string_ = "Selected tools differ in preferred sharding: please " + "re-run with -[no_]core_sharded or -[no_]core_serial"; + return; + } + } + typename sched_type_t::scheduler_options_t sched_ops; if (op_core_sharded.get_value() || op_core_serial.get_value()) { + if (!offline) { + // TODO i#7040: Add core-sharded support for online tools. + this->success_ = false; + this->error_string_ = "Core-sharded is not yet supported for online analysis"; + return; + } if (op_core_serial.get_value()) { this->parallel_ = false; } @@ -502,8 +549,10 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() return; } if (!this->init_scheduler(tracedir, only_threads, only_shards, - op_verbose.get_value(), std::move(sched_ops))) + op_verbose.get_value(), std::move(sched_ops))) { this->success_ = false; + return; + } } else if (op_infile.get_value().empty()) { // XXX i#3323: Add parallel analysis support for online tools. this->parallel_ = false; @@ -520,12 +569,15 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() if (!this->init_scheduler(std::move(reader), std::move(end), op_verbose.get_value(), std::move(sched_ops))) { this->success_ = false; + return; } } else { // Legacy file. 
if (!this->init_scheduler(op_infile.get_value(), {}, {}, op_verbose.get_value(), - std::move(sched_ops))) + std::move(sched_ops))) { this->success_ = false; + return; + } } if (!init_analysis_tools()) { this->success_ = false; diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 8e98e08ed9d..3bd7d7c4a03 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -299,13 +299,19 @@ droption_t op_v2p_file( droption_t op_cpu_scheduling( DROPTION_SCOPE_CLIENT, "cpu_scheduling", false, "Map threads to cores matching recorded cpu execution", - "By default, the simulator schedules threads to simulated cores in a static " + "By default for online analysis, the simulator schedules threads to simulated cores " + "in a static " "round-robin fashion. This option causes the scheduler to instead use the recorded " "cpu that each thread executed on (at a granularity of the trace buffer size) " "for scheduling, mapping traced cpu's to cores and running each segment of each " "thread on the core that owns the recorded cpu for that segment. " "This option is not supported with -core_serial; use " - "-cpu_schedule_file with -core_serial instead."); + "-cpu_schedule_file with -core_serial instead. For offline analysis, the " + "recommendation is to not recreate the as-traced schedule (as it is not accurate due " + "to overhead) and instead use a dynamic schedule via -core_serial. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS + "), -core_serial is automatically turned on for offline analysis."); droption_t op_max_trace_size( DROPTION_SCOPE_CLIENT, "max_trace_size", 0, @@ -890,11 +896,16 @@ droption_t op_kernel_trace_buffer_size_shift( // Core-oriented analysis. 
droption_t op_core_sharded( DROPTION_SCOPE_ALL, "core_sharded", false, "Analyze per-core in parallel.", - "By default, the input trace is analyzed in parallel across shards equal to " - "software threads. This option instead schedules those threads onto virtual cores " + "By default, the sharding mode is determined by the preferred shard type of the" + "tools selected (unless overridden, the default preferred type is thread-sharded). " + "This option enables core-sharded, overriding tool defaults. Core-sharded " + "anlysis schedules the input software threads onto virtual cores " "and analyzes each core in parallel. Thus, each shard consists of pieces from " "many software threads. How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -cores."); + "of options with the prefix \"sched_\" along with -cores. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS ") and they all support parallel operation, -core_sharded is " + "automatically turned on for offline analysis."); droption_t op_core_serial( DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.", @@ -902,7 +913,10 @@ droption_t op_core_serial( "However, the resulting schedule is acted upon by a single analysis thread" "which walks the N cores in lockstep in round robin fashion. " "How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -cores."); + "of options with the prefix \"sched_\" along with -cores. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS ") and not all of them support parallel operation, " + "-core_serial is automatically turned on for offline analysis."); droption_t // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. 
diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in index 5cc2818c7bd..82ecaead2a9 100644 --- a/clients/drcachesim/docs/drcachesim.dox.in +++ b/clients/drcachesim/docs/drcachesim.dox.in @@ -1292,8 +1292,8 @@ Neither simulator has a simple way to know which core any particular thread executed on for each of its instructions. The tracer records which core a thread is on each time it writes out a full trace buffer, giving an approximation of the actual scheduling: but this is not representative -due to overhead (see \ref sec_drcachesim_as_traced). By default, these cache and TLB -simulators ignore that +due to overhead (see \ref sec_drcachesim_as_traced). For online analysis, by default, +these cache and TLB simulators ignore that information and schedule threads to simulated cores in a static round-robin fashion with load balancing to fill in gaps with new threads after threads exit. The option "-cpu_scheduling" (see \ref sec_drcachesim_ops) can be @@ -1301,13 +1301,15 @@ used to instead map each physical cpu to a simulated core and use the recorded cpu that each segment of thread execution occurred on to schedule execution following the "as traced" schedule, but as just noted this is not representative. Instead, we recommend using offline traces and dynamic -re-scheduling as explained in \ref sec_drcachesim_sched_dynamic using the -`-core_serial` parameter. Here is an example: +re-scheduling in core-sharded mode as explained in \ref sec_drcachesim_sched_dynamic +using the +`-core_serial` parameter. In offline mode, `-core_serial` is the default for +these simulators. 
\code $ bin64/drrun -t drmemtrace -offline -- ~/test/pi_estimator 8 20 Estimation of pi is 3.141592653798125 -$ bin64/drrun -t drcachesim -core_serial -cores 3 -indir drmemtrace.pi_estimator.*.dir +$ bin64/drrun -t drcachesim -cores 3 -indir drmemtrace.pi_estimator.*.dir Cache simulation results: Core #0 (traced CPU(s): #0) L1I0 (size=32768, assoc=8, block=64, LRU) stats: @@ -1473,6 +1475,9 @@ The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic scheduler to reflect the new schedule. The new timestamps maintain relative ordering but should not be relied upon to indicate accurate durations between events. +When analyzing core-sharded-on-disk traces, `-no_core_sharded` must be passed when +using core-sharded-preferring tools to avoid an error from the framework attempting +to re-schedule the already-scheduled trace. Traces also include markers indicating disruptions in user mode control flow such as signal handler entry and exit. @@ -1512,7 +1517,9 @@ the framework controls the iteration), to request the next trace record for each output on its own. This scheduling is also available to any analysis tool when the input traces are sharded by core (see the `-core_sharded` and `-core_serial` and various `-sched_*` option documentation under \ref sec_drcachesim_ops as well as -core-sharded notes when \ref sec_drcachesim_newtool). +core-sharded notes when \ref sec_drcachesim_newtool), and in fact is the +default when all tools prefer core-sharded operation via +#dynamorio::drmemtrace::analysis_tool_t::preferred_shard_type(). 
******************** \section sec_drcachesim_as_traced As-Traced Schedule Limitations diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 7b3f885777c..18c1a88e6dc 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -3245,9 +3245,14 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu uint64_t blocked_time) { VDO(this, 1, { - static int global_heartbeat; + static int64_t global_heartbeat; + // 10K is too frequent for simple analyzer runs: it is too noisy with + // the new core-sharded-by-default for new users using defaults. + // 50K is a reasonable compromise. + // XXX: Add a runtime option to tweak this. + static constexpr int64_t GLOBAL_HEARTBEAT_CADENCE = 50000; // We are ok with races as the cadence is approximate. - if (++global_heartbeat % 10000 == 0) { + if (++global_heartbeat % GLOBAL_HEARTBEAT_CADENCE == 0) { print_queue_stats(); } }); diff --git a/clients/drcachesim/simulator/cache_simulator.cpp b/clients/drcachesim/simulator/cache_simulator.cpp index 0d5bc263007..e834b3b27ae 100644 --- a/clients/drcachesim/simulator/cache_simulator.cpp +++ b/clients/drcachesim/simulator/cache_simulator.cpp @@ -632,8 +632,7 @@ cache_simulator_t::print_results() std::cerr << "Cache simulation results:\n"; // Print core and associated L1 cache stats first. 
for (unsigned int i = 0; i < knobs_.num_cores; i++) { - print_core(i); - if (shard_type_ == SHARD_BY_CORE || thread_ever_counts_[i] > 0) { + if (print_core(i)) { if (l1_icaches_[i] != l1_dcaches_[i]) { std::cerr << " " << l1_icaches_[i]->get_name() << " (" << l1_icaches_[i]->get_description() << ") stats:" << std::endl; diff --git a/clients/drcachesim/simulator/simulator.cpp b/clients/drcachesim/simulator/simulator.cpp index 9199de6bad3..5c61006a7b3 100644 --- a/clients/drcachesim/simulator/simulator.cpp +++ b/clients/drcachesim/simulator/simulator.cpp @@ -311,18 +311,19 @@ simulator_t::handle_thread_exit(memref_tid_t tid) thread2core_.erase(tid); } -void +bool simulator_t::print_core(int core) const { if (!knob_cpu_scheduling_ && shard_type_ == SHARD_BY_THREAD) { std::cerr << "Core #" << core << " (" << thread_ever_counts_[core] << " thread(s))" << std::endl; + return thread_ever_counts_[core] > 0; } else { std::cerr << "Core #" << core; if (shard_type_ == SHARD_BY_THREAD && cpu_counts_[core] == 0) { // We keep the "(s)" mainly to simplify test templates. std::cerr << " (0 traced CPU(s))" << std::endl; - return; + return false; } std::cerr << " ("; if (shard_type_ == SHARD_BY_THREAD) // Always 1:1 for SHARD_BY_CORE. @@ -338,6 +339,8 @@ simulator_t::print_core(int core) const } } std::cerr << ")" << std::endl; + // If anything ran on this core, need_comma will be true. + return need_comma; } } diff --git a/clients/drcachesim/simulator/simulator.h b/clients/drcachesim/simulator/simulator.h index 78e434a6517..c3359e265da 100644 --- a/clients/drcachesim/simulator/simulator.h +++ b/clients/drcachesim/simulator/simulator.h @@ -69,6 +69,13 @@ class simulator_t : public analysis_tool_t { std::string initialize_shard_type(shard_type_t shard_type) override; + shard_type_t + preferred_shard_type() override + { + // We prefer a dynamic schedule with more realistic thread interleavings. 
+ return SHARD_BY_CORE; + } + bool process_memref(const memref_t &memref) override; @@ -83,7 +90,8 @@ class simulator_t : public analysis_tool_t { double warmup_fraction, uint64_t sim_refs, bool cpu_scheduling, bool use_physical, unsigned int verbose); - void + // Returns whether the core was ever non-empty. + bool print_core(int core) const; int diff --git a/clients/drcachesim/simulator/tlb_simulator.cpp b/clients/drcachesim/simulator/tlb_simulator.cpp index f5ac4ff9e1b..396a9c9cab7 100644 --- a/clients/drcachesim/simulator/tlb_simulator.cpp +++ b/clients/drcachesim/simulator/tlb_simulator.cpp @@ -264,8 +264,7 @@ tlb_simulator_t::print_results() { std::cerr << "TLB simulation results:\n"; for (unsigned int i = 0; i < knobs_.num_cores; i++) { - print_core(i); - if (thread_ever_counts_[i] > 0) { + if (print_core(i)) { std::cerr << " L1I stats:" << std::endl; itlbs_[i]->get_stats()->print_stats(" "); std::cerr << " L1D stats:" << std::endl; diff --git a/clients/drcachesim/tests/offline-burst_client.templatex b/clients/drcachesim/tests/offline-burst_client.templatex index 3bc0005808f..cd0fb000c19 100644 --- a/clients/drcachesim/tests/offline-burst_client.templatex +++ b/clients/drcachesim/tests/offline-burst_client.templatex @@ -23,7 +23,7 @@ DynamoRIO statistics: .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -36,9 +36,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_maps.templatex b/clients/drcachesim/tests/offline-burst_maps.templatex index 50ddb5c76bb..a851855f7c3 100644 --- a/clients/drcachesim/tests/offline-burst_maps.templatex +++ 
b/clients/drcachesim/tests/offline-burst_maps.templatex @@ -11,7 +11,7 @@ pre-DR start pre-DR detach all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_noreach.templatex b/clients/drcachesim/tests/offline-burst_noreach.templatex index 50ddb5c76bb..a851855f7c3 100644 --- a/clients/drcachesim/tests/offline-burst_noreach.templatex +++ b/clients/drcachesim/tests/offline-burst_noreach.templatex @@ -11,7 +11,7 @@ pre-DR start pre-DR detach all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_replace.templatex b/clients/drcachesim/tests/offline-burst_replace.templatex index 6ef8c7388c0..3b6bc0c5ac3 100644 --- a/clients/drcachesim/tests/offline-burst_replace.templatex +++ b/clients/drcachesim/tests/offline-burst_replace.templatex @@ -19,7 +19,7 @@ close file .* close file .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -32,9 +32,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* 
Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_static.templatex b/clients/drcachesim/tests/offline-burst_static.templatex index 1e1446cce3b..5fcc4fa8959 100644 --- a/clients/drcachesim/tests/offline-burst_static.templatex +++ b/clients/drcachesim/tests/offline-burst_static.templatex @@ -20,7 +20,7 @@ DynamoRIO statistics: .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -33,9 +33,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex index 0c3712398cd..4ef96d3d3e9 100644 --- a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex +++ b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex @@ -1,6 +1,6 @@ Hello, world! 
Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\) Misses: 0 Compulsory misses: *[0-9,\.]* Invalidations: 0 -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-filter.templatex b/clients/drcachesim/tests/offline-filter.templatex index 326b785b35a..7db8cbdfaa1 100644 --- a/clients/drcachesim/tests/offline-filter.templatex +++ b/clients/drcachesim/tests/offline-filter.templatex @@ -1,6 +1,6 @@ Hello, world! Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: *[0-9]?[0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-fork.templatex b/clients/drcachesim/tests/offline-fork.templatex index d4a5a4b2402..f8991518514 100644 --- a/clients/drcachesim/tests/offline-fork.templatex +++ b/clients/drcachesim/tests/offline-fork.templatex @@ -3,7 +3,7 @@ parent waiting for child child is running under DynamoRIO child has exited Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -16,9 +16,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 
\(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-instr-only-trace.templatex b/clients/drcachesim/tests/offline-instr-only-trace.templatex index 0c3712398cd..4ef96d3d3e9 100644 --- a/clients/drcachesim/tests/offline-instr-only-trace.templatex +++ b/clients/drcachesim/tests/offline-instr-only-trace.templatex @@ -1,6 +1,6 @@ Hello, world! Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\) Misses: 0 Compulsory misses: *[0-9,\.]* Invalidations: 0 -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-multiproc.templatex b/clients/drcachesim/tests/offline-multiproc.templatex index 76b03014e8e..1555a744eaa 100644 --- a/clients/drcachesim/tests/offline-multiproc.templatex +++ b/clients/drcachesim/tests/offline-multiproc.templatex @@ -1,6 +1,6 @@ all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9\.,]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9\.,]* Invalidations: *0 .* Miss rate: *[0-9]*[,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9\.,]* Misses: *[0-9\.,]* diff --git a/clients/drcachesim/tests/offline-simple.templatex b/clients/drcachesim/tests/offline-simple.templatex index c35d941b14b..3f1e8897be4 100644 --- a/clients/drcachesim/tests/offline-simple.templatex +++ b/clients/drcachesim/tests/offline-simple.templatex @@ -1,6 +1,6 @@ Hello, world! 
Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index db3e217b5d8..61e175a4c5e 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -65,6 +65,11 @@ class schedule_stats_t : public analysis_tool_t { print_results() override; bool parallel_shard_supported() override; + shard_type_t + preferred_shard_type() override + { + return SHARD_BY_CORE; + } void * parallel_shard_init_stream(int shard_index, void *worker_data, memtrace_stream_t *stream) override; diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index e7939ddaac1..5bb52650edc 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -2131,7 +2131,8 @@ if (AARCH64 AND UNIX AND ZLIB_FOUND) prefix_cmd_if_necessary(drcachesim_path ON ${drcachesim_path}) torunonly_api(tool.drcacheoff.tlb_simulator_v2p "${drcachesim_path}" "offline-tlb_simulator_v2p" "" - "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical" + # Do not use core-sharded scheduling, so we'll have a deterministic result. 
+ "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical;-no_core_sharded" OFF OFF) set(tool.drcacheoff.tlb_simulator_v2p_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") @@ -3976,12 +3977,14 @@ if (BUILD_CLIENTS) set(tool.core_on_disk_rawtemp ON) # no preprocessor torunonly_simtool(core_on_disk_schedule ${ci_shared_app} - "-indir ${core_sharded_dir} -tool schedule_stats" "") + # Avoid the default core-sharded from re-scheduling the trace. + "-indir ${core_sharded_dir} -tool schedule_stats -no_core_sharded" "") set(tool.core_on_disk_schedule_rawtemp ON) # no preprocessor # Test -only_shards on core-sharded-on-disk traces. torunonly_simtool(only_shards ${ci_shared_app} - "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3" "") + # Avoid the default core-sharded from re-scheduling the trace. + "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3 -no_core_sharded" "") set(tool.core_on_disk_rawtemp ON) # no preprocessor endif () endif () @@ -4100,10 +4103,10 @@ if (BUILD_CLIENTS) torunonly_drcacheoff(fork linux.fork "" "" "") endif () - # Test reading a legacy pre-interleaved file. + # Test reading a legacy pre-interleaved file in thread-sharded mode. if (ZLIB_FOUND) torunonly_api(tool.drcacheoff.legacy "${drcachesim_path}" "offline-legacy.c" "" - "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz" + "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz;-no_core_sharded" OFF OFF) set(tool.drcacheoff.legacy_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") @@ -4139,16 +4142,20 @@ if (BUILD_CLIENTS) # Test reading a trace in sharded snappy-compressed files. 
if (libsnappy) # with a parallel tool (basic_counts) - torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy.c" "" + torunonly_api(tool.drcacheoff.snappy_parallel "${drcachesim_path}" "offline-snappy.c" "" "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-tool;basic_counts" OFF OFF) + set(tool.drcacheoff.snappy_parallel_basedir + "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") + set(tool.drcacheoff.snappy_parallel_expectbase "offline-snappy") - # with a legacy serial tool (full simulator) - torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy-serial.c" "" - "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir" + # With a legacy serial tool (full simulator) in thread-sharded mode. + torunonly_api(tool.drcacheoff.snappy_serial "${drcachesim_path}" "offline-snappy-serial.c" "" + "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-no_core_sharded" OFF OFF) - set(tool.drcacheoff.snappy_basedir + set(tool.drcacheoff.snappy_serial_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") + set(tool.drcacheoff.snappy_serial_expectbase "offline-snappy-serial") endif() if (UNIX) # Enable on Windows once i#1727 is fixed. @@ -4441,15 +4448,16 @@ if (BUILD_CLIENTS) # Run with -trace_after_instrs to ensure we test the # drbbdup + rseq combo (i#5658, i#5659). "-trace_after_instrs 5K" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + # Run thread-sharded for the invariant checker. + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") # Test filtering. 
torunonly_drcacheoff(rseq-filter linux.rseq "-trace_after_instrs 5K -L0_filter" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") set(tool.drcacheoff.rseq-filter_expectbase "offline-rseq") torunonly_drcacheoff(rseq-dfilter linux.rseq "-trace_after_instrs 5K -L0D_filter" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") set(tool.drcacheoff.rseq-dfilter_expectbase "offline-rseq") endif () @@ -4746,9 +4754,9 @@ if (BUILD_CLIENTS) # Run the record filter tool with a null filter. set(${testname}_postcmd2 "${CMAKE_COMMAND}@-E@make_directory@${outdir}") set(${testname}_postcmd3 ${launch_cmd}) - # Run the analyzer on the result. + # Run the analyzer on the result. Avoid double-core-sharding. set(${testname}_postcmd4 - "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}") + "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}@-no_core_sharded") endmacro () set(testname "tool.record_filter") @@ -4884,7 +4892,7 @@ if (BUILD_CLIENTS) file(MAKE_DIRECTORY ${outdir}) torunonly_api(tool.record_filter_as_traced "${drcachesim_path}" "record_filter_as_traced" - "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF) + "" "-tool;schedule_stats;-no_core_sharded;-indir;${outdir}" OFF OFF) set(tool.record_filter_as_traced_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake") set(tool.record_filter_as_traced_precmd "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@7@-indir@${trace_dir}@-outdir@${outdir}") @@ -4901,7 +4909,7 @@ if (BUILD_CLIENTS) file(MAKE_DIRECTORY ${outdir}) torunonly_api(tool.record_filter_start_idle "${drcachesim_path}" "record_filter_start_idle" - "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF) + "" "-tool;schedule_stats;-indir;${outdir};-no_core_sharded" OFF OFF) set(tool.record_filter_start_idle_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake") 
set(tool.record_filter_start_idle_precmd "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@4@-indir@${trace_dir}@-outdir@${outdir}")