diff --git a/CMakeLists.txt b/CMakeLists.txt index ba527facc36..c0c696c415d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # ********************************************************** -# Copyright (c) 2010-2023 Google, Inc. All rights reserved. +# Copyright (c) 2010-2024 Google, Inc. All rights reserved. # Copyright (c) 2009-2010 VMware, Inc. All rights reserved. # Copyright (c) 2018 Arm Limited All rights reserved. # ********************************************************** @@ -1840,7 +1840,8 @@ endfunction () # TODO i#5767: Install an explicit zlib package on our Windows GA CI images # (this find_package finds a strawberry perl zlib which causes 32-bit build # and 64-bit private loader issues). -if (WIN32 AND AUTOMATED_TESTING) +option(DISABLE_ZLIB "Disable looking for and using zlib" OFF) +if ((WIN32 AND AUTOMATED_TESTING) OR DISABLE_ZLIB) set(ZLIB_FOUND OFF) else () find_package(ZLIB) diff --git a/api/docs/release.dox b/api/docs/release.dox index f43af9d3274..a4a7d9f5049 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -265,6 +265,16 @@ Further non-compatibility-affecting changes include: - Added -trace_instr_intervals_file option to the drmemtrace trace analysis tools framework. The file must be in CSV format containing a tracing interval per line where start and duration are expressed in number of instructions. + - Added modify_marker_value_filter_t to #dynamorio::drmemtrace::record_filter_t to modify + the value of TRACE_MARKER_TYPE_ markers. This filter takes a list of + <marker_type, new_value> and changes every listed marker in the trace to its + corresponding new_value. + - Added trace_analysis_tool::preferred_shard_type() to the drmemtrace framework to + allow switching to core-sharded by default if all tools prefer that mode. 
+ - For the drmemtrace framework, if only core-sharded-preferring tools are enabled + (these include cache and TLB simulators and the schedule_stats tool), -core_sharded or + -core_serial is automatically turned on for offline analysis to enable more + representative simulated software thread scheduling onto virtual cores. **************************************************
diff --git a/clients/drcachesim/CMakeLists.txt b/clients/drcachesim/CMakeLists.txt index 81e54281312..f8ea8feee24 100644 --- a/clients/drcachesim/CMakeLists.txt +++ b/clients/drcachesim/CMakeLists.txt @@ -201,6 +201,7 @@ add_exported_library(drmemtrace_record_filter STATIC tools/filter/type_filter.h tools/filter/encodings2regdeps_filter.h tools/filter/func_id_filter.h + tools/filter/modify_marker_value_filter.h tools/filter/null_filter.h) target_link_libraries(drmemtrace_record_filter drmemtrace_simulator drmemtrace_schedule_file) @@ -1086,6 +1087,17 @@ if (BUILD_TESTS) use_DynamoRIO_drmemtrace_tracer(tool.drcacheoff.burst_syscall_inject) endif () + if (LINUX AND BUILD_PT_POST_PROCESSOR AND BUILD_PT_TRACER) + add_executable(tool.drcacheoff.burst_syscall_pt_SUDO tests/burst_syscall_pt.cpp) + configure_DynamoRIO_static(tool.drcacheoff.burst_syscall_pt_SUDO) + use_DynamoRIO_static_client(tool.drcacheoff.burst_syscall_pt_SUDO drmemtrace_static) + target_link_libraries(tool.drcacheoff.burst_syscall_pt_SUDO drmemtrace_raw2trace + drmemtrace_analyzer test_helpers drmemtrace_basic_counts) + add_win32_flags(tool.drcacheoff.burst_syscall_pt_SUDO) + use_DynamoRIO_drmemtrace_tracer(tool.drcacheoff.burst_syscall_pt_SUDO) + link_with_pthread(tool.drcacheoff.burst_syscall_pt_SUDO) + endif () + if (UNIX) if (X86 AND NOT APPLE) # This test is x86-specific. # uses ptrace and looks for linux-specific syscalls diff --git a/clients/drcachesim/analysis_tool.h b/clients/drcachesim/analysis_tool.h index 24306cd7534..c5cae5f2b55 100644 --- a/clients/drcachesim/analysis_tool.h +++ b/clients/drcachesim/analysis_tool.h @@ -156,6 +156,19 @@ template class analysis_tool_tmpl_t { { return ""; } + /** + * Identifies the preferred shard type for this analysis. This only applies when + * the user does not specify a shard type for a run. In that case, if every tool + * being run prefers #SHARD_BY_CORE, the framework uses that mode. If tools + * disagree then an error is raised. 
This is ignored if the user specifies a + * shard type via one of -core_sharded, -core_serial, -no_core_sharded, + * -no_core_serial, or -cpu_scheduling. + */ + virtual shard_type_t + preferred_shard_type() + { + return SHARD_BY_THREAD; + } /** Returns whether the tool was created successfully. */ virtual bool operator!() diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp index a4f860f4ec4..728648171fa 100644 --- a/clients/drcachesim/analyzer.cpp +++ b/clients/drcachesim/analyzer.cpp @@ -339,6 +339,19 @@ analyzer_tmpl_t::init_scheduler_common( uint64_t filetype = scheduler_.get_stream(i)->get_filetype(); VPRINT(this, 2, "Worker %d filetype %" PRIx64 "\n", i, filetype); if (TESTANY(OFFLINE_FILE_TYPE_CORE_SHARDED, filetype)) { + if (i == 0 && shard_type_ == SHARD_BY_CORE) { + // This is almost certainly user error. + // Better to exit than risk user confusion. + // XXX i#7045: Ideally this could be reported as an error by the + // scheduler, and also detected early in analyzer_multi to auto-fix + // (when no mode is specified: if the user specifies core-sharding + // there could be config differences and this should be an error), + // but neither is simple so today the user has to re-run. + error_string_ = + "Re-scheduling a core-sharded-on-disk trace is generally a " + "mistake; re-run with -no_core_sharded.\n"; + return false; + } shard_type_ = SHARD_BY_CORE; } } @@ -524,7 +537,7 @@ analyzer_tmpl_t::process_serial(analyzer_worker_data_t & while (true) { RecordType record; // The current time is used for time quanta; for instr quanta, it's ignored and - // we pass 0. + // we pass 0 and let the scheduler use instruction + idle counts. uint64_t cur_micros = sched_by_time_ ? 
get_current_microseconds() : 0; typename sched_type_t::stream_status_t status = worker.stream->next_record(record, cur_micros); diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index beb0e526e6f..521ae6bf488 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -339,7 +339,8 @@ record_analyzer_multi_t::create_analysis_tool_from_options(const std::string &to op_filter_cache_size.get_value(), op_filter_trace_types.get_value(), op_filter_marker_types.get_value(), op_trim_before_timestamp.get_value(), op_trim_after_timestamp.get_value(), op_encodings2regdeps.get_value(), - op_filter_func_ids.get_value(), op_verbose.get_value()); + op_filter_func_ids.get_value(), op_modify_marker_value.get_value(), + op_verbose.get_value()); } ERRMSG("Usage error: unsupported record analyzer type \"%s\". Only " RECORD_FILTER " is supported.\n", @@ -461,6 +462,7 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() if (!error.empty()) { this->success_ = false; this->error_string_ = "raw2trace failed: " + error; + return; } } } @@ -472,8 +474,54 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() return; } + bool sharding_specified = op_core_sharded.specified() || op_core_serial.specified() || + // -cpu_scheduling implies thread-sharded. + op_cpu_scheduling.get_value(); + // TODO i#7040: Add core-sharded support for online tools. 
+ bool offline = !op_indir.get_value().empty() || !op_infile.get_value().empty(); + if (offline && !sharding_specified) { + bool all_prefer_thread_sharded = true; + bool all_prefer_core_sharded = true; + for (int i = 0; i < this->num_tools_; ++i) { + if (this->tools_[i]->preferred_shard_type() == SHARD_BY_THREAD) { + all_prefer_core_sharded = false; + } else if (this->tools_[i]->preferred_shard_type() == SHARD_BY_CORE) { + all_prefer_thread_sharded = false; + } + if (this->parallel_ && !this->tools_[i]->parallel_shard_supported()) { + this->parallel_ = false; + } + } + if (all_prefer_core_sharded) { + // XXX i#6949: Ideally we could detect a core-sharded-on-disk input + // here and avoid this but that's not simple so currently we have a + // fatal error from the analyzer and the user must re-run with + // -no_core_sharded for such inputs. + if (this->parallel_) { + if (op_verbose.get_value() > 0) + fprintf(stderr, "Enabling -core_sharded as all tools prefer it\n"); + op_core_sharded.set_value(true); + } else { + if (op_verbose.get_value() > 0) + fprintf(stderr, "Enabling -core_serial as all tools prefer it\n"); + op_core_serial.set_value(true); + } + } else if (!all_prefer_thread_sharded) { + this->success_ = false; + this->error_string_ = "Selected tools differ in preferred sharding: please " + "re-run with -[no_]core_sharded or -[no_]core_serial"; + return; + } + } + typename sched_type_t::scheduler_options_t sched_ops; if (op_core_sharded.get_value() || op_core_serial.get_value()) { + if (!offline) { + // TODO i#7040: Add core-sharded support for online tools. 
+ this->success_ = false; + this->error_string_ = "Core-sharded is not yet supported for online analysis"; + return; + } if (op_core_serial.get_value()) { this->parallel_ = false; } @@ -501,8 +549,10 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() return; } if (!this->init_scheduler(tracedir, only_threads, only_shards, - op_verbose.get_value(), std::move(sched_ops))) + op_verbose.get_value(), std::move(sched_ops))) { this->success_ = false; + return; + } } else if (op_infile.get_value().empty()) { // XXX i#3323: Add parallel analysis support for online tools. this->parallel_ = false; @@ -519,12 +569,15 @@ analyzer_multi_tmpl_t::analyzer_multi_tmpl_t() if (!this->init_scheduler(std::move(reader), std::move(end), op_verbose.get_value(), std::move(sched_ops))) { this->success_ = false; + return; } } else { // Legacy file. if (!this->init_scheduler(op_infile.get_value(), {}, {}, op_verbose.get_value(), - std::move(sched_ops))) + std::move(sched_ops))) { this->success_ = false; + return; + } } if (!init_analysis_tools()) { this->success_ = false; @@ -574,6 +627,8 @@ analyzer_multi_tmpl_t::init_dynamic_schedule() sched_ops.rebalance_period_us = op_sched_rebalance_period_us.get_value(); sched_ops.randomize_next_input = op_sched_randomize.get_value(); sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value(); + sched_ops.exit_if_fraction_inputs_left = + op_sched_exit_if_fraction_inputs_left.get_value(); #ifdef HAS_ZIP if (!op_record_file.get_value().empty()) { record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value())); diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 889a0225a7c..3bd7d7c4a03 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -299,13 +299,19 @@ droption_t op_v2p_file( droption_t op_cpu_scheduling( DROPTION_SCOPE_CLIENT, "cpu_scheduling", false, "Map threads to cores matching recorded cpu execution", - "By default, the 
simulator schedules threads to simulated cores in a static " + "By default for online analysis, the simulator schedules threads to simulated cores " + "in a static " "round-robin fashion. This option causes the scheduler to instead use the recorded " "cpu that each thread executed on (at a granularity of the trace buffer size) " "for scheduling, mapping traced cpu's to cores and running each segment of each " "thread on the core that owns the recorded cpu for that segment. " "This option is not supported with -core_serial; use " - "-cpu_schedule_file with -core_serial instead."); + "-cpu_schedule_file with -core_serial instead. For offline analysis, the " + "recommendation is to not recreate the as-traced schedule (as it is not accurate due " + "to overhead) and instead use a dynamic schedule via -core_serial. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS + "), -core_serial is automatically turned on for offline analysis."); droption_t op_max_trace_size( DROPTION_SCOPE_CLIENT, "max_trace_size", 0, @@ -890,11 +896,16 @@ droption_t op_kernel_trace_buffer_size_shift( // Core-oriented analysis. droption_t op_core_sharded( DROPTION_SCOPE_ALL, "core_sharded", false, "Analyze per-core in parallel.", - "By default, the input trace is analyzed in parallel across shards equal to " - "software threads. This option instead schedules those threads onto virtual cores " + "By default, the sharding mode is determined by the preferred shard type of the " + "tools selected (unless overridden, the default preferred type is thread-sharded). " + "This option enables core-sharded, overriding tool defaults. Core-sharded " + "analysis schedules the input software threads onto virtual cores " "and analyzes each core in parallel. Thus, each shard consists of pieces from " "many software threads. 
How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -cores."); + "of options with the prefix \"sched_\" along with -cores. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS ") and they all support parallel operation, -core_sharded is " + "automatically turned on for offline analysis."); droption_t op_core_serial( DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.", @@ -902,23 +913,26 @@ droption_t op_core_serial( "However, the resulting schedule is acted upon by a single analysis thread" "which walks the N cores in lockstep in round robin fashion. " "How the scheduling is performed is controlled by a set " - "of options with the prefix \"sched_\" along with -cores."); + "of options with the prefix \"sched_\" along with -cores. If only " + "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB + ", " SCHEDULE_STATS ") and not all of them support parallel operation, " + "-core_serial is automatically turned on for offline analysis."); droption_t // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000, "Scheduling quantum", - "Applies to -core_sharded and -core_serial. " - "Scheduling quantum in instructions, unless -sched_time is set in " - "which case this value is multiplied by -sched_time_per_us to " - "produce a quantum in wall-clock microseconds."); + "Applies to -core_sharded and -core_serial. Scheduling quantum in " + "instructions, unless -sched_time is set in which case this value " + "is the quantum in simulated microseconds (equal to wall-clock " + "microseconds multiplied by -sched_time_per_us)."); droption_t op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false, "Whether to use time for the scheduling quantum", - "Applies to -core_sharded and -core_serial. 
" - "Whether to use wall-clock time for the scheduling quantum, with a " - "value equal to -sched_quantum in microseconds of wall-clock time."); + "Applies to -core_sharded and -core_serial. Whether to use wall-clock " + "time (multiplied by -sched_time_per_us) for measuring idle time and " + "for the scheduling quantum (see -sched_quantum)."); droption_t op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", true, "Whether to honor recorded timestamps for ordering", @@ -1016,11 +1030,12 @@ droption_t op_sched_infinite_timeouts( "(set to false)."); droption_t op_sched_time_units_per_us( - DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100., + DROPTION_SCOPE_ALL, "sched_time_units_per_us", 1000., "Time units per simulated microsecond", - "Time units (currently wall-clock time) per simulated microsecond. This scales all " - "of the -sched_*_us values as it converts wall-clock time into the simulated " - "microseconds measured by those options."); + "Time units per simulated microsecond. The units are either the instruction count " + "plus the idle count (the default) or if -sched_time is selected wall-clock " + "microseconds. This option value scales all of the -sched_*_us values as it " + "converts time units into the simulated microseconds measured by those options."); droption_t op_sched_migration_threshold_us( DROPTION_SCOPE_ALL, "sched_migration_threshold_us", 500, @@ -1034,6 +1049,18 @@ droption_t op_sched_rebalance_period_us( "The period in simulated microseconds at which per-core run queues are re-balanced " "to redistribute load."); +droption_t op_sched_exit_if_fraction_inputs_left( + DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.1, + "Exit if non-EOF inputs left are <= this fraction of the total", + "Applies to -core_sharded and -core_serial. 
When an input reaches EOF, if the " + "number of non-EOF inputs left as a fraction of the original inputs is equal to or " + "less than this value then the scheduler exits (sets all outputs to EOF) rather than " + "finishing off the final inputs. This helps avoid long sequences of idles during " + "staggered endings with fewer inputs left than cores and only a small fraction of " + "the total instructions left in those inputs. Since the remaining instruction " + "count is not considered (as it is not available), use discretion when raising " + "this value on uneven inputs."); + // Schedule_stats options. droption_t op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", @@ -1099,6 +1126,15 @@ droption_t "for the listed function IDs and removes those belonging to " "unlisted function IDs."); +droption_t<std::string> op_modify_marker_value( + DROPTION_SCOPE_FRONTEND, "filter_modify_marker_value", "", + "Comma-separated pairs of integers representing <marker_type, new_value>.", + "This option is for -tool " RECORD_FILTER ". It modifies the value of all listed " + "TRACE_MARKER_TYPE_ markers in the trace with their corresponding new_value. " + "The list must have an even size. 
Example: -filter_modify_marker_value 3,24,18,2048 " + "sets all TRACE_MARKER_TYPE_CPU_ID == 3 in the trace to core 24 and " + "TRACE_MARKER_TYPE_PAGE_SIZE == 18 to 2k."); + droption_t op_trim_before_timestamp( DROPTION_SCOPE_ALL, "trim_before_timestamp", 0, 0, (std::numeric_limits::max)(), diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 75633a1928c..9e64dd920b3 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -221,6 +221,7 @@ extern dynamorio::droption::droption_t op_sched_infinite_timeouts; extern dynamorio::droption::droption_t op_sched_migration_threshold_us; extern dynamorio::droption::droption_t op_sched_rebalance_period_us; extern dynamorio::droption::droption_t op_sched_time_units_per_us; +extern dynamorio::droption::droption_t op_sched_exit_if_fraction_inputs_left; extern dynamorio::droption::droption_t op_schedule_stats_print_every; extern dynamorio::droption::droption_t op_syscall_template_file; extern dynamorio::droption::droption_t op_filter_stop_timestamp; @@ -229,6 +230,7 @@ extern dynamorio::droption::droption_t op_filter_trace_types; extern dynamorio::droption::droption_t op_filter_marker_types; extern dynamorio::droption::droption_t op_encodings2regdeps; extern dynamorio::droption::droption_t op_filter_func_ids; +extern dynamorio::droption::droption_t op_modify_marker_value; extern dynamorio::droption::droption_t op_trim_before_timestamp; extern dynamorio::droption::droption_t op_trim_after_timestamp; extern dynamorio::droption::droption_t op_abort_on_invariant_error; diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index 1b97224485a..903374ddd00 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -699,6 +699,9 @@ typedef enum { // Values below here are available for users to use for custom markers. 
} trace_marker_type_t; +// As documented in TRACE_MARKER_TYPE_CPU_ID, this value indicates an unknown CPU. +#define INVALID_CPU_MARKER_VALUE static_cast<uintptr_t>(-1) + /** Constants related to function or system call parameter tracing. */ enum class func_trace_t : uint64_t { // VS2019 won't infer 64-bit with "enum {". /** diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in index 406e3293530..82ecaead2a9 100644 --- a/clients/drcachesim/docs/drcachesim.dox.in +++ b/clients/drcachesim/docs/drcachesim.dox.in @@ -179,9 +179,9 @@ Some of the more important markers are: - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_KERNEL_XFER - This identifies a system call that changes control flow, such as a signal return. -- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP - The marker value provides a timestamp for this point of the trace (in units of microseconds since Jan 1, 1601 UTC). This value can be used to synchronize records from different threads as well as analyze latencies (however, tracing overhead inflates time unevenly, so time deltas should not be considered perfectly representative). It is used in the sequential analysis of a multi-threaded trace. +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP - The marker value provides a timestamp for this point of the trace (in units of microseconds since Jan 1, 1601 UTC). This value can be used to synchronize records from different threads as well as analyze latencies (however, tracing overhead inflates time unevenly, so time deltas should not be considered perfectly representative). It is used in the sequential analysis of a multi-threaded trace. When dynamic scheduling is performed (see \ref sec_drcachesim_sched_dynamic), the original timestamp value is replaced with a synthetic value in order to maintain relative ordering (though the new values should not be relied upon to indicate accurate durations between events). 
-- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. It can be used to identify the "as traced" schedule, indicating which threads were on which cores at which times. However, this schedule is not representative and should not be treated as indicating how the application behaves without tracing. See \ref sec_drcachesim_as_traced for further information. +- #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID - The marker value contains the CPU identifier on which the subsequent records were collected. It can be used to identify the "as traced" schedule, indicating which threads were on which cores at which times. However, this schedule is not representative and should not be treated as indicating how the application behaves without tracing. See \ref sec_drcachesim_as_traced for further information. When dynamic scheduling is performed (see \ref sec_drcachesim_sched_dynamic), the original value of this marker is replaced with the new virtual core identifier. - #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ID, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETADDR, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_ARG, #dynamorio::drmemtrace::TRACE_MARKER_TYPE_FUNC_RETVAL - These markers are used to capture information about function calls. Which functions to capture must be explicitly selected at tracing time. Typical candiates are heap allocation and freeing functions. See \ref sec_drcachesim_funcs. @@ -1292,8 +1292,8 @@ Neither simulator has a simple way to know which core any particular thread executed on for each of its instructions. The tracer records which core a thread is on each time it writes out a full trace buffer, giving an approximation of the actual scheduling: but this is not representative -due to overhead (see \ref sec_drcachesim_as_traced). By default, these cache and TLB -simulators ignore that +due to overhead (see \ref sec_drcachesim_as_traced). 
For online analysis, by default, +these cache and TLB simulators ignore that information and schedule threads to simulated cores in a static round-robin fashion with load balancing to fill in gaps with new threads after threads exit. The option "-cpu_scheduling" (see \ref sec_drcachesim_ops) can be @@ -1301,13 +1301,15 @@ used to instead map each physical cpu to a simulated core and use the recorded cpu that each segment of thread execution occurred on to schedule execution following the "as traced" schedule, but as just noted this is not representative. Instead, we recommend using offline traces and dynamic -re-scheduling as explained in \ref sec_drcachesim_sched_dynamic using the -`-core_serial` parameter. Here is an example: +re-scheduling in core-sharded mode as explained in \ref sec_drcachesim_sched_dynamic +using the +`-core_serial` parameter. In offline mode, `-core_serial` is the default for +these simulators. \code $ bin64/drrun -t drmemtrace -offline -- ~/test/pi_estimator 8 20 Estimation of pi is 3.141592653798125 -$ bin64/drrun -t drcachesim -core_serial -cores 3 -indir drmemtrace.pi_estimator.*.dir +$ bin64/drrun -t drcachesim -cores 3 -indir drmemtrace.pi_estimator.*.dir Cache simulation results: Core #0 (traced CPU(s): #0) L1I0 (size=32768, assoc=8, block=64, LRU) stats: @@ -1469,8 +1471,13 @@ While we suggest keeping traces stored as thread-sharded and using the dynamic s in each run, there is support for running the scheduler once and creating a new set of stored traces in core-sharded format: essentially switching to hardware-thread-oriented traces. This is done using the \ref sec_tool_record_filter tool in `-core_sharded` mode. -The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the -dynamic scheduler, and should be ignored in a newly created core-sharded trace. 
+The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic +scheduler to reflect the new schedule. The new timestamps maintain relative ordering +but should not be relied upon to indicate accurate durations between events. +When analyzing core-sharded-on-disk traces, `-no_core_sharded` must be passed when +using core-sharded-preferring tools to avoid an error from the framework attempting +to re-schedule the already-scheduled trace. Traces also include markers indicating disruptions in user mode control flow such as signal handler entry and exit. @@ -1510,7 +1517,9 @@ the framework controls the iteration), to request the next trace record for each output on its own. This scheduling is also available to any analysis tool when the input traces are sharded by core (see the `-core_sharded` and `-core_serial` and various `-sched_*` option documentation under \ref sec_drcachesim_ops as well as -core-sharded notes when \ref sec_drcachesim_newtool). +core-sharded notes when \ref sec_drcachesim_newtool), and in fact is the +default when all tools prefer core-sharded operation via +#dynamorio::drmemtrace::analysis_tool_t::preferred_shard_type(). ******************** \section sec_drcachesim_as_traced As-Traced Schedule Limitations @@ -1535,7 +1544,9 @@ through the -cpu_schedule_file option. \section sec_drcachesim_sched_dynamic Dynamic Scheduling Instead of using the as-traced schedule, we recommend re-scheduling the traced software -threads using our trace scheduler. Our scheduler essentially serves as an operating +threads using our trace scheduler in +#dynamorio::drmemtrace::scheduler_t::MAP_TO_ANY_OUTPUT mode. +Our scheduler essentially serves as an operating system scheduler for this purpose, though using simpler schemes. 
It models separate runqueues per core with support for binding inputs to certain cores, priorities, idle time from blocking system calls, migration thresholds, rebalancing runqueues, etc. It @@ -1564,8 +1575,10 @@ The downsides include: thus, timestamp ordering will be followed at context switch points for picking the next input, but timestamps will not preempt an input. -The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are not modified by the -dynamic scheduler, and should be ignored in a dynamic rescheduling. +The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and +#dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic +scheduler to reflect the new schedule. The new timestamps maintain relative ordering +but should not be relied upon to indicate accurate durations between events. ******************** \section sec_drcachesim_sched_time Simulated Time diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index e9ae6114265..18c1a88e6dc 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -118,7 +118,8 @@ replay_file_checker_t::check(archive_istream_t *infile) scheduler_t::schedule_record_t record; bool prev_was_idle = false; while (infile->read(reinterpret_cast(&record), sizeof(record))) { - if (record.type == scheduler_t::schedule_record_t::IDLE) { + if (record.type == scheduler_t::schedule_record_t::IDLE || + record.type == scheduler_t::schedule_record_t::IDLE_BY_COUNT) { if (prev_was_idle) return "Error: consecutive idle records"; prev_was_idle = true; @@ -277,6 +278,17 @@ scheduler_tmpl_t::record_type_is_non_marker_header(memref_t return false; } +template <> +bool +scheduler_tmpl_t::record_type_set_marker_value(memref_t &record, + uintptr_t value) +{ + if (record.marker.type != TRACE_TYPE_MARKER) + return false; + record.marker.marker_value = value; + return true; +} + template <> bool 
scheduler_tmpl_t::record_type_is_timestamp(memref_t record, @@ -472,6 +484,17 @@ scheduler_tmpl_t::record_type_is_instr_boundary( !record_reader_t::record_is_pre_instr(&prev_record); } +template <> +bool +scheduler_tmpl_t::record_type_set_marker_value( + trace_entry_t &record, uintptr_t value) +{ + if (record.type != TRACE_TYPE_MARKER) + return false; + record.addr = value; + return true; +} + template <> bool scheduler_tmpl_t::record_type_is_timestamp( @@ -725,6 +748,8 @@ scheduler_tmpl_t::print_configuration() options_.rebalance_period_us); VPRINT(this, 1, " %-25s : %d\n", "honor_infinite_timeouts", options_.honor_infinite_timeouts); + VPRINT(this, 1, " %-25s : %f\n", "exit_if_fraction_inputs_left", + options_.exit_if_fraction_inputs_left); } template @@ -1027,6 +1052,11 @@ scheduler_tmpl_t::legacy_field_support() error_string_ = "block_time_max_us must be > 0"; return STATUS_ERROR_INVALID_PARAMETER; } + if (options_.exit_if_fraction_inputs_left < 0. || + options_.exit_if_fraction_inputs_left > 1.) { + error_string_ = "exit_if_fraction_inputs_left must be 0..1"; + return STATUS_ERROR_INVALID_PARAMETER; + } return STATUS_SUCCESS; } @@ -1298,10 +1328,17 @@ scheduler_tmpl_t::read_recorded_schedule() VPRINT(this, 1, "output %d empty: returning eof up front\n", i); set_cur_input(i, INVALID_INPUT_ORDINAL); outputs_[i].at_eof = true; - } else if (outputs_[i].record[0].type == schedule_record_t::IDLE) { + } else if (outputs_[i].record[0].type == schedule_record_t::IDLE || + outputs_[i].record[0].type == schedule_record_t::IDLE_BY_COUNT) { set_cur_input(i, INVALID_INPUT_ORDINAL); outputs_[i].waiting = true; - outputs_[i].wait_start_time = 0; // Updated on first next_record(). + if (outputs_[i].record[0].type == schedule_record_t::IDLE) { + // Convert a legacy idle duration from microseconds to record counts. 
+ outputs_[i].record[0].value.idle_duration = + static_cast(options_.time_units_per_us * + outputs_[i].record[0].value.idle_duration); + } + outputs_[i].idle_start_count = -1; // Updated on first next_record(). VPRINT(this, 3, "output %d starting out idle\n", i); } else { assert(outputs_[i].record[0].type == schedule_record_t::DEFAULT); @@ -2339,7 +2376,11 @@ scheduler_tmpl_t::advance_region_of_interest( return status; } input.queue.push_back(create_thread_exit(input.tid)); - mark_input_eof(input); + sched_type_t::stream_status_t status = mark_input_eof(input); + // For early EOF we still need our synthetic exit so do not return it yet. + if (status != sched_type_t::STATUS_OK && + status != sched_type_t::STATUS_EOF) + return status; return sched_type_t::STATUS_SKIPPED; } } @@ -2466,7 +2507,9 @@ scheduler_tmpl_t::skip_instructions(input_info_t &input, input.instrs_pre_read = 0; } if (*input.reader == *input.reader_end) { - mark_input_eof(input); + sched_type_t::stream_status_t status = mark_input_eof(input); + if (status != sched_type_t::STATUS_OK) + return status; // Raise error because the input region is out of bounds, unless the max // was used which we ourselves use internally for times_of_interest. if (skip_amount >= std::numeric_limits::max() - 2) { @@ -2518,17 +2561,25 @@ scheduler_tmpl_t::record_schedule_segment( input_ordinal_t input, uint64_t start_instruction, uint64_t stop_instruction) { assert(type == schedule_record_t::VERSION || type == schedule_record_t::FOOTER || - type == schedule_record_t::IDLE || inputs_[input].lock->owned_by_cur_thread()); + // ::IDLE is a legacy type we should not see in new recordings. + type == schedule_record_t::IDLE_BY_COUNT || + inputs_[input].lock->owned_by_cur_thread()); // We always use the current wall-clock time, as the time stored in the prior // next_record() call can be out of order across outputs and lead to deadlocks. 
uint64_t timestamp = get_time_micros(); - if (type == schedule_record_t::IDLE && - outputs_[output].record.back().type == schedule_record_t::IDLE) { + if (type == schedule_record_t::IDLE_BY_COUNT && + outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) { // Merge. We don't need intermediate timestamps when idle, and consecutive // idle records quickly balloon the file. return sched_type_t::STATUS_OK; } - VPRINT(this, 4, + if (type == schedule_record_t::IDLE_BY_COUNT) { + // Start prior to this idle. + outputs_[output].idle_start_count = outputs_[output].idle_count - 1; + // That is what we'll record in the value union shared w/ start_instruction. + assert(start_instruction == outputs_[output].idle_count - 1); + } + VPRINT(this, 3, "recording out=%d type=%d input=%d start=%" PRIu64 " stop=%" PRIu64 " time=%" PRIu64 "\n", output, type, input, start_instruction, stop_instruction, timestamp); @@ -2547,24 +2598,25 @@ scheduler_tmpl_t::close_schedule_segment(output_ordinal_ assert(!outputs_[output].record.empty()); assert(outputs_[output].record.back().type == schedule_record_t::VERSION || outputs_[output].record.back().type == schedule_record_t::FOOTER || - outputs_[output].record.back().type == schedule_record_t::IDLE || + // ::IDLE is for legacy recordings, not new ones. + outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT || input.lock->owned_by_cur_thread()); if (outputs_[output].record.back().type == schedule_record_t::SKIP) { // Skips already have a final stop value. return sched_type_t::STATUS_OK; } - if (outputs_[output].record.back().type == schedule_record_t::IDLE) { - // Just like in record_schedule_segment() we use wall-clock time for recording - // replay timestamps. 
- uint64_t end = get_time_micros(); - assert(end >= outputs_[output].record.back().timestamp); + if (outputs_[output].record.back().type == schedule_record_t::IDLE_BY_COUNT) { + uint64_t end_idle_count = outputs_[output].idle_count; + assert(outputs_[output].idle_start_count >= 0); + assert(end_idle_count >= + static_cast(outputs_[output].idle_start_count)); outputs_[output].record.back().value.idle_duration = - end - outputs_[output].record.back().timestamp; + end_idle_count - outputs_[output].idle_start_count; VPRINT(this, 3, - "close_schedule_segment: idle duration %" PRIu64 " = %" PRIu64 + "close_schedule_segment[%d]: idle duration %" PRIu64 " = %" PRIu64 " - %" PRIu64 "\n", - outputs_[output].record.back().value.idle_duration, end, - outputs_[output].record.back().timestamp); + output, outputs_[output].record.back().value.idle_duration, end_idle_count, + outputs_[output].idle_start_count); return sched_type_t::STATUS_OK; } uint64_t instr_ord = get_instr_ordinal(input); @@ -2583,9 +2635,9 @@ scheduler_tmpl_t::close_schedule_segment(output_ordinal_ ++instr_ord; } VPRINT(this, 3, - "close_schedule_segment: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64 + "close_schedule_segment[%d]: input=%d type=%d start=%" PRIu64 " stop=%" PRIu64 "\n", - input.index, outputs_[output].record.back().type, + output, input.index, outputs_[output].record.back().type, outputs_[output].record.back().value.start_instruction, instr_ord); // Check for empty default entries, except the starter 0,0 ones. assert(outputs_[output].record.back().type != schedule_record_t::DEFAULT || @@ -2708,8 +2760,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( if (res->blocked_time > 0 && // cur_time can be 0 at initialization time. (cur_time == 0 || - // XXX i#6966: We have seen wall-clock time go backward, which - // underflows here and then always unblocks the input. + // Guard against time going backward (happens for wall-clock: i#6966). 
+ cur_time < res->blocked_start_time || cur_time - res->blocked_start_time < res->blocked_time)) { VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n", res->index, @@ -2720,28 +2772,42 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( // This input is no longer blocked. res->blocked_time = 0; res->unscheduled = false; + VPRINT(this, 4, "pop queue: %d @ %" PRIu64 " no longer blocked\n", + res->index, cur_time); // We've found a candidate. One final check if this is a migration. bool found_candidate = false; if (from_output == for_output) found_candidate = true; else { assert(cur_time > 0 || res->last_run_time == 0); + if (res->last_run_time == 0) { + // For never-executed inputs we consider their last execution + // to be the very first simulation time, which we can't + // easily initialize until here. + res->last_run_time = outputs_[from_output].initial_cur_time->load( + std::memory_order_acquire); + } VPRINT(this, 5, "migration check %d to %d: cur=%" PRIu64 " last=%" PRIu64 " delta=%" PRId64 " vs thresh %" PRIu64 "\n", from_output, for_output, cur_time, res->last_run_time, cur_time - res->last_run_time, options_.migration_threshold_us); - // Guard against time going backward, which happens: i#6966. - if (options_.migration_threshold_us == 0 || res->last_run_time == 0 || + // Guard against time going backward (happens for wall-clock: i#6966). + if (options_.migration_threshold_us == 0 || + // Allow free movement for the initial load balance at init time. + cur_time == 0 || (cur_time > res->last_run_time && cur_time - res->last_run_time >= static_cast(options_.migration_threshold_us * options_.time_units_per_us))) { VPRINT(this, 2, "migrating %d to %d\n", from_output, for_output); found_candidate = true; - ++outputs_[from_output] - .stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]; + // Do not count an initial rebalance as a migration. 
+ if (cur_time > 0) { + ++outputs_[from_output] + .stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]; + } } } if (found_candidate) @@ -2758,6 +2824,8 @@ scheduler_tmpl_t::pop_from_ready_queue_hold_locks( if (res == nullptr && !blocked.empty()) { // Do not hand out EOF thinking we're done: we still have inputs blocked // on i/o, so just wait and retry. + if (for_output != INVALID_OUTPUT_ORDINAL) + ++outputs_[for_output].idle_count; status = STATUS_IDLE; } // Re-add the ones we skipped, but without changing their counters so we preserve @@ -2910,7 +2978,9 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, add_to_ready_queue(output, &inputs_[prev_input]); } } else if (options_.schedule_record_ostream != nullptr && - outputs_[output].record.back().type == schedule_record_t::IDLE) { + (outputs_[output].record.back().type == schedule_record_t::IDLE || + outputs_[output].record.back().type == + schedule_record_t::IDLE_BY_COUNT)) { input_info_t unused; sched_type_t::stream_status_t status = close_schedule_segment(output, unused); if (status != sched_type_t::STATUS_OK) @@ -3021,18 +3091,27 @@ scheduler_tmpl_t::pick_next_input_as_previously( } return eof_or_idle(output, outputs_[output].cur_input); } - const schedule_record_t &segment = outputs_[output].record[record_index + 1]; - if (segment.type == schedule_record_t::IDLE) { + schedule_record_t &segment = outputs_[output].record[record_index + 1]; + if (segment.type == schedule_record_t::IDLE || + segment.type == schedule_record_t::IDLE_BY_COUNT) { outputs_[output].waiting = true; - outputs_[output].wait_start_time = get_output_time(output); + if (segment.type == schedule_record_t::IDLE) { + // Convert a legacy idle duration from microseconds to record counts. 
+ segment.value.idle_duration = static_cast( + options_.time_units_per_us * segment.value.idle_duration); + } + outputs_[output].idle_start_count = outputs_[output].idle_count; outputs_[output].record_index->fetch_add(1, std::memory_order_release); + ++outputs_[output].idle_count; + VPRINT(this, 5, "%s[%d]: next replay segment idle for %" PRIu64 "\n", + __FUNCTION__, output, segment.value.idle_duration); return sched_type_t::STATUS_IDLE; } index = segment.key.input; VPRINT(this, 5, - "pick_next_input_as_previously[%d]: next replay segment in=%d (@%" PRId64 - ") type=%d start=%" PRId64 " end=%" PRId64 "\n", - output, index, get_instr_ordinal(inputs_[index]), segment.type, + "%s[%d]: next replay segment in=%d (@%" PRId64 ") type=%d start=%" PRId64 + " end=%" PRId64 "\n", + __FUNCTION__, output, index, get_instr_ordinal(inputs_[index]), segment.type, segment.value.start_instruction, segment.stop_instruction); { std::lock_guard lock(*inputs_[index].lock); @@ -3109,11 +3188,13 @@ scheduler_tmpl_t::pick_next_input_as_previously( // queued candidate record, if any. clear_input_queue(inputs_[index]); inputs_[index].queue.push_back(create_thread_exit(inputs_[index].tid)); - mark_input_eof(inputs_[index]); VPRINT(this, 2, "early end for input %d\n", index); // We're done with this entry but we need the queued record to be read, // so we do not move past the entry. 
outputs_[output].record_index->fetch_add(1, std::memory_order_release); + sched_type_t::stream_status_t status = mark_input_eof(inputs_[index]); + if (status != sched_type_t::STATUS_OK) + return status; return sched_type_t::STATUS_SKIPPED; } else if (segment.type == schedule_record_t::SKIP) { std::lock_guard lock(*inputs_[index].lock); @@ -3164,9 +3245,14 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu uint64_t blocked_time) { VDO(this, 1, { - static int global_heartbeat; + static int64_t global_heartbeat; + // 10K is too frequent for simple analyzer runs: it is too noisy with + // the new core-sharded-by-default for new users using defaults. + // 50K is a reasonable compromise. + // XXX: Add a runtime option to tweak this. + static constexpr int64_t GLOBAL_HEARTBEAT_CADENCE = 50000; // We are ok with races as the cadence is approximate. - if (++global_heartbeat % 10000 == 0) { + if (++global_heartbeat % GLOBAL_HEARTBEAT_CADENCE == 0) { print_queue_stats(); } }); @@ -3384,7 +3470,9 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (options_.schedule_record_ostream != nullptr) { sched_type_t::stream_status_t record_status = record_schedule_segment( - output, schedule_record_t::IDLE, 0, 0, 0); + output, schedule_record_t::IDLE_BY_COUNT, 0, + // Start prior to this idle. + outputs_[output].idle_count - 1, 0); if (record_status != sched_type_t::STATUS_OK) return record_status; } @@ -3456,8 +3544,11 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (inputs_[index].at_eof || *inputs_[index].reader == *inputs_[index].reader_end) { VPRINT(this, 2, "next_record[%d]: input #%d at eof\n", output, index); - if (!inputs_[index].at_eof) - mark_input_eof(inputs_[index]); + if (!inputs_[index].at_eof) { + sched_type_t::stream_status_t status = mark_input_eof(inputs_[index]); + if (status != sched_type_t::STATUS_OK) + return status; + } index = INVALID_INPUT_ORDINAL; // Loop and pick next thread. 
continue; @@ -3711,34 +3802,39 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // do return an error on a time smaller than an input's current start time when we // check for quantum end. if (cur_time == 0) { - // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of - // in get_output_time(). This also makes the two more similarly behaved with - // respect to blocking system calls. - // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely - // with a counter-based time, weighted appropriately for STATUS_IDLE. - cur_time = get_time_micros(); + // We add 1 to avoid an invalid value of 0. + cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() + + outputs_[output].idle_count; + } + if (outputs_[output].initial_cur_time->load(std::memory_order_acquire) == 0) { + outputs_[output].initial_cur_time->store(cur_time, std::memory_order_release); } // Invalid values for cur_time are checked below. outputs_[output].cur_time->store(cur_time, std::memory_order_release); - if (!outputs_[output].active->load(std::memory_order_acquire)) + if (!outputs_[output].active->load(std::memory_order_acquire)) { + ++outputs_[output].idle_count; return sched_type_t::STATUS_IDLE; + } if (outputs_[output].waiting) { if (options_.mapping == MAP_AS_PREVIOUSLY && - outputs_[output].wait_start_time > 0) { + outputs_[output].idle_start_count >= 0) { uint64_t duration = outputs_[output] .record[outputs_[output].record_index->load( std::memory_order_acquire)] .value.idle_duration; - uint64_t now = get_output_time(output); - if (now - outputs_[output].wait_start_time < duration) { + uint64_t now = outputs_[output].idle_count; + if (now - outputs_[output].idle_start_count < duration) { VPRINT(this, 4, "next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n", - output, now - outputs_[output].wait_start_time, duration); - return sched_type_t::STATUS_WAIT; + output, now - outputs_[output].idle_start_count, duration); + 
++outputs_[output].idle_count; + return sched_type_t::STATUS_IDLE; } else - outputs_[output].wait_start_time = 0; + outputs_[output].idle_start_count = -1; } - VPRINT(this, 5, "next_record[%d]: need new input (cur=waiting)\n", output); + VPRINT(this, 5, + "next_record[%d]: need new input (cur=waiting; idles=%" PRIu64 ")\n", + output, outputs_[output].idle_count); sched_type_t::stream_status_t res = pick_next_input(output, 0); if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_SKIPPED) return res; @@ -3802,8 +3898,11 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->needs_advance = true; } if (input->at_eof || *input->reader == *input->reader_end) { - if (!input->at_eof) - mark_input_eof(*input); + if (!input->at_eof) { + sched_type_t::stream_status_t status = mark_input_eof(*input); + if (status != sched_type_t::STATUS_OK) + return status; + } lock.unlock(); VPRINT(this, 5, "next_record[%d]: need new input (cur=%d eof)\n", output, input->index); @@ -4081,6 +4180,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } VPRINT(this, 4, "next_record[%d]: from %d @%" PRId64 ": ", output, input->index, cur_time); + update_next_record(output, record); VDO(this, 4, print_record(record);); outputs_[output].last_record = record; @@ -4089,6 +4189,58 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, return sched_type_t::STATUS_OK; } +template +void +scheduler_tmpl_t::update_next_record(output_ordinal_t output, + RecordType &record) +{ + if (options_.mapping != MAP_TO_ANY_OUTPUT && options_.mapping != MAP_AS_PREVIOUSLY) + return; // Nothing to do. + if (options_.replay_as_traced_istream != nullptr) { + // Do not modify MAP_TO_RECORDED_OUTPUT (turned into MAP_AS_PREVIOUSLY). + return; + } + // For a dynamic schedule, the as-traced cpuids and timestamps no longer + // apply and are just confusing (causing problems like interval analysis + // failures), so we replace them. 
+ trace_marker_type_t type; + uintptr_t value; + if (!record_type_is_marker(record, type, value)) + return; // Nothing to do. + if (type == TRACE_MARKER_TYPE_TIMESTAMP) { + if (outputs_[output].base_timestamp == 0) { + // Record the first input's first timestamp, as a base value. +#ifndef NDEBUG + bool ok = +#endif + record_type_is_timestamp(record, outputs_[output].base_timestamp); + assert(ok); + assert(outputs_[output].base_timestamp != 0); + VPRINT(this, 2, "output %d base timestamp = %zu\n", output, + outputs_[output].base_timestamp); + } + uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal(); + uint64_t idle_count = outputs_[output].idle_count; + uintptr_t new_time = static_cast( + outputs_[output].base_timestamp + (instr_ord + idle_count) / INSTRS_PER_US); + VPRINT(this, 4, + "New time in output %d: %zu from base %zu and instrs %" PRIu64 + " idles %" PRIu64 "\n", + output, new_time, outputs_[output].base_timestamp, instr_ord, idle_count); +#ifndef NDEBUG + bool ok = +#endif + record_type_set_marker_value(record, new_time); + assert(ok); + } else if (type == TRACE_MARKER_TYPE_CPU_ID) { +#ifndef NDEBUG + bool ok = +#endif + record_type_set_marker_value(record, get_shard_index(output)); + assert(ok); + } +} + template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::unread_last_record(output_ordinal_t output, @@ -4167,17 +4319,28 @@ scheduler_tmpl_t::stop_speculation(output_ordinal_t outp } template -void +typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::mark_input_eof(input_info_t &input) { assert(input.lock->owned_by_cur_thread()); if (input.at_eof) - return; + return sched_type_t::STATUS_OK; input.at_eof = true; - assert(live_input_count_.load(std::memory_order_acquire) > 0); - live_input_count_.fetch_add(-1, std::memory_order_release); - VPRINT(this, 2, "input %d at eof; %d live inputs left\n", input.index, - live_input_count_.load(std::memory_order_acquire)); +#ifndef NDEBUG + int old_count = +#endif + 
live_input_count_.fetch_add(-1, std::memory_order_release); + assert(old_count > 0); + int live_inputs = live_input_count_.load(std::memory_order_acquire); + VPRINT(this, 2, "input %d at eof; %d live inputs left\n", input.index, live_inputs); + if (options_.mapping == MAP_TO_ANY_OUTPUT && + live_inputs <= + static_cast(inputs_.size() * options_.exit_if_fraction_inputs_left)) { + VPRINT(this, 1, "exiting early at input %d with %d live inputs left\n", + input.index, live_inputs); + return sched_type_t::STATUS_EOF; + } + return sched_type_t::STATUS_OK; } template @@ -4188,8 +4351,8 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, // XXX i#6831: Refactor to use subclasses or templates to specialize // scheduler code based on mapping options, to avoid these top-level // conditionals in many functions? - if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || - live_input_count_.load(std::memory_order_acquire) == 0 || + int live_inputs = live_input_count_.load(std::memory_order_acquire); + if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || live_inputs == 0 || // While a full schedule recorded should have each input hit either its // EOF or ROI end, we have a fallback to avoid hangs for possible recorded // schedules that end an input early deliberately without an ROI. @@ -4198,6 +4361,13 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); return sched_type_t::STATUS_EOF; } + if (options_.mapping == MAP_TO_ANY_OUTPUT && + live_inputs <= + static_cast(inputs_.size() * options_.exit_if_fraction_inputs_left)) { + VPRINT(this, 1, "output %d exiting early with %d live inputs left\n", output, + live_inputs); + return sched_type_t::STATUS_EOF; + } // Before going idle, try to steal work from another output. // We start with us+1 to avoid everyone stealing from the low-numbered outputs. 
// We only try when we first transition to idle; we rely on rebalancing after that, @@ -4227,6 +4397,7 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, if (prev_input != INVALID_INPUT_ORDINAL) ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE]; set_cur_input(output, INVALID_INPUT_ORDINAL); + ++outputs_[output].idle_count; return sched_type_t::STATUS_IDLE; } diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 48f41280e02..e752839c1f8 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -739,8 +739,11 @@ template class scheduler_tmpl_t { * other parameters that are in microseconds (they all end in "_us": e.g., * #quantum_duration_us) so that they operate on the right time scale for the * passed-in simulator time (or wall-clock microseconds if no time is passed). + * The default value is a rough estimate when no accurate simulated time is + * available: the instruction count is used in that case, and we use the + * instructions per microsecond for a 2GHz clock at 0.5 IPC as our default. */ - double time_units_per_us = 100.; + double time_units_per_us = 1000.; /** * The scheduling quantum duration for preemption, in simulated microseconds, * for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to @@ -805,6 +808,17 @@ template class scheduler_tmpl_t { * (#block_time_max_us) scaled by #block_time_multiplier. */ bool honor_infinite_timeouts = false; + /** + * For #MAP_TO_ANY_OUTPUT, when an input reaches EOF, if the number of non-EOF + * inputs left as a fraction of the original inputs is equal to or less than + * this value then the scheduler exits (sets all outputs to EOF) rather than + * finishing off the final inputs. This helps avoid long sequences of idles + * during staggered endings with fewer inputs left than cores and only a small + * fraction of the total instructions left in those inputs. 
Since the remaining + * instruction count is not considered (as it is not available), use discretion + * when raising this value on uneven inputs. + */ + double exit_if_fraction_inputs_left = 0.1; // When adding new options, also add to print_configuration(). }; @@ -863,7 +877,8 @@ template class scheduler_tmpl_t { // diminished. /** * Advances to the next record in the stream. Returns a status code on whether - * and how to continue. + * and how to continue. Uses the instruction count plus idle count to this point + * as the time; use the variant that takes "cur_time" to instead provide a time. */ virtual stream_status_t next_record(RecordType &record); @@ -871,11 +886,12 @@ template class scheduler_tmpl_t { /** * Advances to the next record in the stream. Returns a status code on whether * and how to continue. Supplies the current time for #QUANTUM_TIME. The time - * should be considered to be the time prior to processing the returned record. - * The time is unitless but needs to be a globally consistent increasing value - * across all output streams. #STATUS_INVALID is returned if 0 or a value smaller - * than the start time of the current input's quantum is passed in when - * #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified. + * should be considered to be the simulated time prior to processing the returned + * record. The time's units can be chosen by the caller, with + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.time_units_per_us + * providing the conversion to simulated microseconds. #STATUS_INVALID is + * returned if 0 or a value smaller than the start time of the current input's + * quantum is passed in when #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified. 
*/ virtual stream_status_t next_record(RecordType &record, uint64_t cur_time); @@ -985,6 +1001,15 @@ template class scheduler_tmpl_t { return scheduler_->get_input_stream(ordinal_)->get_instruction_ordinal(); return cur_instr_count_; } + /** + * Identical to get_instruction_ordinal() but ignores the + * #SCHEDULER_USE_INPUT_ORDINALS flag. + */ + uint64_t + get_output_instruction_ordinal() const + { + return cur_instr_count_; + } /** * Returns a name for the current input stream feeding this output stream. For * stored offline traces, this is the base name of the trace on disk. For online @@ -1434,6 +1459,9 @@ template class scheduler_tmpl_t { // Indicates that the output is idle. The value.idle_duration field holds // a duration in microseconds. IDLE, + // Indicates that the output is idle. The value.idle_duration field holds + // a duration as a count of idle records. + IDLE_BY_COUNT, }; static constexpr int VERSION_CURRENT = 0; schedule_record_t() = default; @@ -1468,6 +1496,7 @@ template class scheduler_tmpl_t { { } // For record_type_t::IDLE, the duration in microseconds of the idling. + // For record_type_t::IDLE_BY_COUNT, the duration as a count of idle records. uint64_t idle_duration; // Input stream ordinal of starting point, for non-IDLE types. uint64_t start_instruction = 0; @@ -1616,6 +1645,9 @@ template class scheduler_tmpl_t { cur_time = std::unique_ptr>(new std::atomic()); cur_time->store(0, std::memory_order_relaxed); + initial_cur_time = + std::unique_ptr>(new std::atomic()); + initial_cur_time->store(0, std::memory_order_relaxed); record_index = std::unique_ptr>(new std::atomic()); record_index->store(0, std::memory_order_relaxed); } @@ -1663,15 +1695,24 @@ template class scheduler_tmpl_t { // This is accessed by other outputs for stealing and rebalancing. // Indirected so we can store it in our vector. std::unique_ptr> cur_time; + // The first simulation time passed to this output. 
+ // This is accessed by other outputs for stealing and rebalancing. + // Indirected so we can store it in our vector. + std::unique_ptr> initial_cur_time; // Used for MAP_TO_RECORDED_OUTPUT get_output_cpuid(). int64_t as_traced_cpuid = -1; // Used for MAP_AS_PREVIOUSLY with live_replay_output_count_. bool at_eof = false; - // Used for replaying wait periods. - uint64_t wait_start_time = 0; + // Used for recording and replaying idle periods. + int64_t idle_start_count = -1; // Exported statistics. Currently all integers and cast to double on export. std::vector stats = std::vector(memtrace_stream_t::SCHED_STAT_TYPE_COUNT); + // When no simulation time is passed to us, we use the idle count plus + // instruction count to measure time. + uint64_t idle_count = 0; + // The first timestamp (pre-update_next_record()) seen on the first input. + uintptr_t base_timestamp = 0; }; // Used for reading as-traced schedules. @@ -1722,6 +1763,9 @@ template class scheduler_tmpl_t { uint64_t input_count = 0; }; + // We assume a 2GHz clock and IPC=1. + static constexpr uint64_t INSTRS_PER_US = 2000; + // Called just once at initialization time to set the initial input-to-output // mappings and state. scheduler_status_t @@ -1918,6 +1962,9 @@ template class scheduler_tmpl_t { bool record_type_is_timestamp(RecordType record, uintptr_t &value); + bool + record_type_set_marker_value(RecordType &record, uintptr_t value); + bool record_type_is_invalid(RecordType record); @@ -1943,6 +1990,9 @@ template class scheduler_tmpl_t { void insert_switch_tid_pid(input_info_t &input); + void + update_next_record(output_ordinal_t output, RecordType &record); + // Used for diagnostics: prints record fields to stderr. void print_record(const RecordType &record); @@ -2021,7 +2071,10 @@ template class scheduler_tmpl_t { set_output_active(output_ordinal_t output, bool active); // Caller must hold the input's lock. 
- void + // The return value is STATUS_EOF if a global exit is now happening (an + // early exit); otherwise STATUS_OK is returned on success but only a + // local EOF. + stream_status_t mark_input_eof(input_info_t &input); // Determines whether to exit or wait for other outputs when one output diff --git a/clients/drcachesim/simulator/cache_simulator.cpp b/clients/drcachesim/simulator/cache_simulator.cpp index 0d5bc263007..e834b3b27ae 100644 --- a/clients/drcachesim/simulator/cache_simulator.cpp +++ b/clients/drcachesim/simulator/cache_simulator.cpp @@ -632,8 +632,7 @@ cache_simulator_t::print_results() std::cerr << "Cache simulation results:\n"; // Print core and associated L1 cache stats first. for (unsigned int i = 0; i < knobs_.num_cores; i++) { - print_core(i); - if (shard_type_ == SHARD_BY_CORE || thread_ever_counts_[i] > 0) { + if (print_core(i)) { if (l1_icaches_[i] != l1_dcaches_[i]) { std::cerr << " " << l1_icaches_[i]->get_name() << " (" << l1_icaches_[i]->get_description() << ") stats:" << std::endl; diff --git a/clients/drcachesim/simulator/caching_device.cpp b/clients/drcachesim/simulator/caching_device.cpp index 0126e0b8203..08482bd0f0b 100644 --- a/clients/drcachesim/simulator/caching_device.cpp +++ b/clients/drcachesim/simulator/caching_device.cpp @@ -240,8 +240,8 @@ caching_device_t::request(const memref_t &memref_in) // Issue a hardware prefetch, if any, before we remember the last tag, // so we remember this line and not the prefetched line. 
- if (missed && !type_is_prefetch(memref.data.type) && prefetcher_ != nullptr) - prefetcher_->prefetch(this, memref); + if (prefetcher_ != nullptr && !type_is_prefetch(memref_in.data.type)) + prefetcher_->prefetch(this, memref, missed); if (tag + 1 <= final_tag) { addr_t next_addr = (tag + 1) << block_size_bits_; diff --git a/clients/drcachesim/simulator/prefetcher.cpp b/clients/drcachesim/simulator/prefetcher.cpp index f9ffa9950ca..04f9b08378a 100644 --- a/clients/drcachesim/simulator/prefetcher.cpp +++ b/clients/drcachesim/simulator/prefetcher.cpp @@ -48,13 +48,16 @@ prefetcher_t::prefetcher_t(int block_size) } void -prefetcher_t::prefetch(caching_device_t *cache, const memref_t &memref_in) +prefetcher_t::prefetch(caching_device_t *cache, const memref_t &memref_in, + const bool missed) { // We implement a simple next-line prefetcher. - memref_t memref = memref_in; - memref.data.addr += block_size_; - memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; - cache->request(memref); + if (missed) { + memref_t memref = memref_in; + memref.data.addr += block_size_; + memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; + cache->request(memref); + } } } // namespace drmemtrace } // namespace dynamorio diff --git a/clients/drcachesim/simulator/prefetcher.h b/clients/drcachesim/simulator/prefetcher.h index aa960f7b7d5..5729ae2c38a 100644 --- a/clients/drcachesim/simulator/prefetcher.h +++ b/clients/drcachesim/simulator/prefetcher.h @@ -50,8 +50,11 @@ class prefetcher_t { virtual ~prefetcher_t() { } + // prefetch() will be called for all demand accesses, even those that + // hit in the cache. The missed parameter indicates whether the + // memref.data.addr is already in the cache or not. 
virtual void - prefetch(caching_device_t *cache, const memref_t &memref); + prefetch(caching_device_t *cache, const memref_t &memref, bool missed); protected: int block_size_; diff --git a/clients/drcachesim/simulator/simulator.cpp b/clients/drcachesim/simulator/simulator.cpp index 9199de6bad3..5c61006a7b3 100644 --- a/clients/drcachesim/simulator/simulator.cpp +++ b/clients/drcachesim/simulator/simulator.cpp @@ -311,18 +311,19 @@ simulator_t::handle_thread_exit(memref_tid_t tid) thread2core_.erase(tid); } -void +bool simulator_t::print_core(int core) const { if (!knob_cpu_scheduling_ && shard_type_ == SHARD_BY_THREAD) { std::cerr << "Core #" << core << " (" << thread_ever_counts_[core] << " thread(s))" << std::endl; + return thread_ever_counts_[core] > 0; } else { std::cerr << "Core #" << core; if (shard_type_ == SHARD_BY_THREAD && cpu_counts_[core] == 0) { // We keep the "(s)" mainly to simplify test templates. std::cerr << " (0 traced CPU(s))" << std::endl; - return; + return false; } std::cerr << " ("; if (shard_type_ == SHARD_BY_THREAD) // Always 1:1 for SHARD_BY_CORE. @@ -338,6 +339,8 @@ simulator_t::print_core(int core) const } } std::cerr << ")" << std::endl; + // If anything ran on this core, need_comma will be true. + return need_comma; } } diff --git a/clients/drcachesim/simulator/simulator.h b/clients/drcachesim/simulator/simulator.h index 78e434a6517..c3359e265da 100644 --- a/clients/drcachesim/simulator/simulator.h +++ b/clients/drcachesim/simulator/simulator.h @@ -69,6 +69,13 @@ class simulator_t : public analysis_tool_t { std::string initialize_shard_type(shard_type_t shard_type) override; + shard_type_t + preferred_shard_type() override + { + // We prefer a dynamic schedule with more realistic thread interleavings. 
+ return SHARD_BY_CORE; + } + bool process_memref(const memref_t &memref) override; @@ -83,7 +90,8 @@ class simulator_t : public analysis_tool_t { double warmup_fraction, uint64_t sim_refs, bool cpu_scheduling, bool use_physical, unsigned int verbose); - void + // Returns whether the core was ever non-empty. + bool print_core(int core) const; int diff --git a/clients/drcachesim/simulator/tlb_simulator.cpp b/clients/drcachesim/simulator/tlb_simulator.cpp index f5ac4ff9e1b..396a9c9cab7 100644 --- a/clients/drcachesim/simulator/tlb_simulator.cpp +++ b/clients/drcachesim/simulator/tlb_simulator.cpp @@ -264,8 +264,7 @@ tlb_simulator_t::print_results() { std::cerr << "TLB simulation results:\n"; for (unsigned int i = 0; i < knobs_.num_cores; i++) { - print_core(i); - if (thread_ever_counts_[i] > 0) { + if (print_core(i)) { std::cerr << " L1I stats:" << std::endl; itlbs_[i]->get_stats()->print_stats(" "); std::cerr << " L1D stats:" << std::endl; diff --git a/clients/drcachesim/tests/burst_syscall_pt.cpp b/clients/drcachesim/tests/burst_syscall_pt.cpp new file mode 100644 index 00000000000..f58dccaf9e1 --- /dev/null +++ b/clients/drcachesim/tests/burst_syscall_pt.cpp @@ -0,0 +1,347 @@ +/* ********************************************************** + * Copyright (c) 2016-2024 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. 
nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +// This application links in drmemtrace_static and acquires a trace during +// a "burst" of execution that includes some system call traces collected +// using Intel-PT. + +// This is set globally in CMake for other tests so easier to undef here. +#undef DR_REG_ENUM_COMPATIBILITY + +#include "analyzer.h" +#include "tools/basic_counts.h" +#include "dr_api.h" +#include "drmemtrace/drmemtrace.h" +#include "drmemtrace/raw2trace.h" +#include "mock_reader.h" +#include "raw2trace_directory.h" +#include "scheduler.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dynamorio { +namespace drmemtrace { + +#define FATAL_ERROR(msg, ...) \ + do { \ + fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); \ + fflush(stderr); \ + exit(1); \ + } while (0) + +/* The futex the child waits at initially. */ +static uint32_t futex_var = 0xf00d; +/* The futex the child is transferred to. 
*/ +static uint32_t futex_var_other = 0x8bad; + +static void * +child_futex_wait(void *) +{ + long res = syscall(SYS_futex, &futex_var, FUTEX_WAIT, /*#val=*/0xf00d, + /*timeout=*/nullptr, /*uaddr2=*/nullptr, /*val3=*/0); + assert(res == 0); + std::cerr << "Child released from futex\n"; + return NULL; +} + +static void +parent_futex_wake() +{ + /* The child would be waiting at the other futex by now. + * i#7034: Note that the child thread undergoes detach while it is waiting + * on futex_var_other. There is a bug at this point due to a possible + * transparency violation in DR. When the child thread restarts futex after + * being interrupted by DR's detach signal, it is found to resume + * waiting at the original futex_var instead of futex_var_other. + * If we modify this app to do detach after parent_futex_wake returns, then + * the child is found to be waiting at futex_var_other as expected. + */ + uint32_t *child_waiting_at_futex = &futex_var; + long res = syscall(SYS_futex, child_waiting_at_futex, FUTEX_WAKE, /*#wakeup=*/1, + /*timeout=*/nullptr, /*uaddr2=*/nullptr, /*val3=*/0); + assert(res == 1); +} + +static void +parent_futex_reque() +{ + long res; + do { + /* Repeat until the child is surely waiting at the futex. We'll know this + * when the following call returns a 1, which means the child was + * transferred to futex_var_other. This is to ensure that the child thread + * is inside the futex syscall when DR detaches. + */ + res = syscall(SYS_futex, &futex_var, FUTEX_CMP_REQUEUE, /*#wakeup_max=*/0, + /*#requeue_max=*/1, /*uaddr2=*/&futex_var_other, /*val3=*/0xf00d); + assert(res == 0 || res == 1); + } while (res == 0); +} + +static int +do_some_syscalls() +{ + getpid(); + gettid(); + return 1; +} + +static std::string +postprocess(void *dr_context) +{ + std::cerr << "Post-processing the trace\n"; + // Get path to write the final trace to. 
+ const char *raw_dir; + drmemtrace_status_t mem_res = drmemtrace_get_output_path(&raw_dir); + assert(mem_res == DRMEMTRACE_SUCCESS); + std::string outdir = std::string(raw_dir) + DIRSEP + "post_processed"; + + const char *kcore_path; + drmemtrace_status_t kcore_res = drmemtrace_get_kcore_path(&kcore_path); + assert(kcore_res == DRMEMTRACE_SUCCESS); + + raw2trace_directory_t dir; + if (!dr_create_dir(outdir.c_str())) + FATAL_ERROR("Failed to create output dir."); + std::string dir_err = dir.initialize(raw_dir, outdir, DEFAULT_TRACE_COMPRESSION_TYPE, + /*syscall_template_file=*/""); + assert(dir_err.empty()); + raw2trace_t raw2trace(dir.modfile_bytes_, dir.in_files_, dir.out_files_, + dir.out_archives_, dir.encoding_file_, + dir.serial_schedule_file_, dir.cpu_schedule_file_, dr_context, + /*verbosity=*/0, /*worker_count=*/-1, + /*alt_module_dir=*/"", + /*chunk_instr_count=*/10 * 1000 * 1000, dir.in_kfiles_map_, + dir.kcoredir_, /*kallsyms_path=*/"", + /*syscall_template_file=*/nullptr, + // We want to fail if any error is encountered. + /*pt2ir_best_effort=*/false); + std::string error = raw2trace.do_conversion(); + if (!error.empty()) + FATAL_ERROR("raw2trace failed: %s\n", error.c_str()); + uint64 decoded_syscall_count = + raw2trace.get_statistic(RAW2TRACE_STAT_SYSCALL_TRACES_CONVERTED); + // We should see atleast the getpid, gettid, and futex syscalls made by the parent. + if (decoded_syscall_count <= 2) { + std::cerr << "Incorrect decoded syscall count (found: " << decoded_syscall_count + << " vs expected > 2)\n"; + } + return outdir; +} + +// Trace analysis tool that allows us to verify properties of the generated PT trace. 
+class pt_analysis_tool_t : public analysis_tool_t { +public: + pt_analysis_tool_t() + { + } + bool + process_memref(const memref_t &memref) override + { + FATAL_ERROR("Expected to use sharded mode"); + return true; + } + bool + parallel_shard_supported() override + { + return true; + } + void * + parallel_shard_init(int shard_index, void *worker_data) override + { + auto per_shard = new per_shard_t; + return reinterpret_cast(per_shard); + } + bool + parallel_shard_exit(void *shard_data) override + { + std::lock_guard guard(shard_exit_mutex_); + per_shard_t *shard = reinterpret_cast(shard_data); + if (shard->syscall_count == 0) + return true; + // In case the child has just the one futex syscall which was skipped + // from the trace. + if (shard->syscall_count > 1 && !shard->any_syscall_had_trace) { + std::cerr << "No syscall had a trace\n"; + } + if (shard->prev_was_futex_marker && !shard->prev_syscall_had_trace) { + found_final_futex_without_trace_ = true; + } + if (shard->kernel_instr_count > 0) { + found_some_kernel_instrs_ = true; + } + return true; + } + bool + parallel_shard_memref(void *shard_data, const memref_t &memref) override + { + per_shard_t *shard = reinterpret_cast(shard_data); + if (memref.marker.type == TRACE_TYPE_MARKER) { + switch (memref.marker.marker_type) { + case TRACE_MARKER_TYPE_SYSCALL_TRACE_START: + shard->in_syscall_trace = true; + break; + case TRACE_MARKER_TYPE_SYSCALL_TRACE_END: + shard->in_syscall_trace = false; + shard->prev_syscall_had_trace = true; + shard->any_syscall_had_trace = true; + break; + case TRACE_MARKER_TYPE_SYSCALL: + ++shard->syscall_count; + shard->prev_syscall_had_trace = false; + if (memref.marker.marker_value == SYS_futex) { + shard->prev_was_futex_marker = true; + } + break; + } + } + if (!type_is_instr(memref.data.type)) + return true; + if (shard->in_syscall_trace) { + ++shard->kernel_instr_count; + return true; + } + shard->prev_was_futex_marker = false; + shard->prev_syscall_had_trace = false; + return 
true; + } + bool + print_results() override + { + if (!found_final_futex_without_trace_) { + std::cerr + << "Did not find any thread trace with final futex without PT trace\n"; + } else { + std::cerr << "Found matching signature in a thread\n"; + } + if (!found_some_kernel_instrs_) { + std::cerr << "Did not find any kernel instrs\n"; + } + return true; + } + +private: + // Data tracked per shard. + struct per_shard_t { + bool prev_was_futex_marker = false; + bool prev_syscall_had_trace = false; + bool any_syscall_had_trace = false; + int syscall_count = 0; + bool in_syscall_trace = false; + int kernel_instr_count = 0; + }; + + bool found_final_futex_without_trace_ = false; + bool found_some_kernel_instrs_ = false; + std::mutex shard_exit_mutex_; +}; + +static bool +run_pt_analysis(const std::string &trace_dir) +{ + auto pt_analysis_tool = std::unique_ptr(new pt_analysis_tool_t()); + std::vector tools; + tools.push_back(pt_analysis_tool.get()); + analyzer_t analyzer(trace_dir, &tools[0], static_cast(tools.size())); + if (!analyzer) { + FATAL_ERROR("failed to initialize analyzer: %s", + analyzer.get_error_string().c_str()); + } + if (!analyzer.run()) { + FATAL_ERROR("failed to run analyzer: %s", analyzer.get_error_string().c_str()); + } + if (!analyzer.print_stats()) { + FATAL_ERROR("failed to print stats: %s", analyzer.get_error_string().c_str()); + } + return true; +} + +static void +gather_trace() +{ + if (setenv("DYNAMORIO_OPTIONS", + "-stderr_mask 0xc -client_lib ';;-offline -enable_kernel_tracing", + 1 /*override*/) != 0) + std::cerr << "failed to set env var!\n"; + dr_app_setup(); + assert(!dr_app_running_under_dynamorio()); + dr_app_start(); + + pthread_t child_thread; + int res = pthread_create(&child_thread, NULL, child_futex_wait, NULL); + assert(res == 0); + + /* Ensure that the child is waiting at a futex. */ + parent_futex_reque(); + + do_some_syscalls(); + + dr_app_stop_and_cleanup(); + + /* Wake up the child finally. 
*/ + parent_futex_wake(); + + pthread_join(child_thread, NULL); + + return; +} + +static int +test_pt_trace(void *dr_context) +{ + std::string trace_dir = postprocess(dr_context); + if (!run_pt_analysis(trace_dir)) + return 1; + return 0; +} + +int +test_main(int argc, const char *argv[]) +{ + gather_trace(); + void *dr_context = dr_standalone_init(); + if (test_pt_trace(dr_context)) { + return 1; + } + dr_standalone_exit(); + return 0; +} + +} // namespace drmemtrace +} // namespace dynamorio diff --git a/clients/drcachesim/tests/drcachesim_unit_tests.cpp b/clients/drcachesim/tests/drcachesim_unit_tests.cpp index eae89fa0218..a0b8878f1cd 100644 --- a/clients/drcachesim/tests/drcachesim_unit_tests.cpp +++ b/clients/drcachesim/tests/drcachesim_unit_tests.cpp @@ -313,13 +313,6 @@ unit_test_nextline_prefetcher() class next2line_prefetcher_factory_t : public prefetcher_factory_t { public: - prefetcher_t * - create_prefetcher(int block_size) override - { - return new next2line_prefetcher_t(block_size); - } - -private: class next2line_prefetcher_t : public prefetcher_t { public: next2line_prefetcher_t(int block_size) @@ -327,17 +320,33 @@ class next2line_prefetcher_factory_t : public prefetcher_factory_t { { } void - prefetch(caching_device_t *cache, const memref_t &memref_in) + prefetch(caching_device_t *cache, const memref_t &memref_in, const bool missed) { // We implement a simple 2 next-line prefetcher. - memref_t memref = memref_in; - memref.data.addr += block_size_; - memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; - cache->request(memref); - memref.data.addr += block_size_; - cache->request(memref); + // We also track whether inputs are hits or misses for testing. 
+ if (missed) { + misses_++; + memref_t memref = memref_in; + memref.data.addr += block_size_; + memref.data.type = TRACE_TYPE_HARDWARE_PREFETCH; + cache->request(memref); + memref.data.addr += block_size_; + cache->request(memref); + } else { + hits_++; + } } + int hits_ = 0; + int misses_ = 0; }; + + next2line_prefetcher_t * + create_prefetcher(int block_size) override + { + prefetcher_ = new next2line_prefetcher_t(block_size); + return prefetcher_; + } + next2line_prefetcher_t *prefetcher_; }; void @@ -361,6 +370,8 @@ unit_test_custom_prefetcher() assert(nextline_cache_sim.get_cache_metric(metric_name_t::MISSES, 1, 0) == EXPECTED_MISSES_NEXT2LINE_PREFETCHER); + assert(next2line_prefetcher_factory.prefetcher_->hits_ == 4); + assert(next2line_prefetcher_factory.prefetcher_->misses_ == 2); } void unit_test_child_hits() diff --git a/clients/drcachesim/tests/offline-burst_client.templatex b/clients/drcachesim/tests/offline-burst_client.templatex index 3bc0005808f..cd0fb000c19 100644 --- a/clients/drcachesim/tests/offline-burst_client.templatex +++ b/clients/drcachesim/tests/offline-burst_client.templatex @@ -23,7 +23,7 @@ DynamoRIO statistics: .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -36,9 +36,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_maps.templatex b/clients/drcachesim/tests/offline-burst_maps.templatex index 50ddb5c76bb..a851855f7c3 100644 --- a/clients/drcachesim/tests/offline-burst_maps.templatex +++ b/clients/drcachesim/tests/offline-burst_maps.templatex @@ -11,7 +11,7 @@ pre-DR start pre-DR detach all done Cache 
simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_noreach.templatex b/clients/drcachesim/tests/offline-burst_noreach.templatex index 50ddb5c76bb..a851855f7c3 100644 --- a/clients/drcachesim/tests/offline-burst_noreach.templatex +++ b/clients/drcachesim/tests/offline-burst_noreach.templatex @@ -11,7 +11,7 @@ pre-DR start pre-DR detach all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_replace.templatex b/clients/drcachesim/tests/offline-burst_replace.templatex index 6ef8c7388c0..3b6bc0c5ac3 100644 --- a/clients/drcachesim/tests/offline-burst_replace.templatex +++ b/clients/drcachesim/tests/offline-burst_replace.templatex @@ -19,7 +19,7 @@ close file .* close file .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -32,9 +32,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 
\(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_static.templatex b/clients/drcachesim/tests/offline-burst_static.templatex index 1e1446cce3b..5fcc4fa8959 100644 --- a/clients/drcachesim/tests/offline-burst_static.templatex +++ b/clients/drcachesim/tests/offline-burst_static.templatex @@ -20,7 +20,7 @@ DynamoRIO statistics: .* all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -33,9 +33,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-3][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex b/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex new file mode 100644 index 00000000000..840000756c6 --- /dev/null +++ b/clients/drcachesim/tests/offline-burst_syscall_pt_SUDO.templatex @@ -0,0 +1,9 @@ +ERROR: PT tracing for the last syscall .* of thread .* was found active at detach. +Child released from futex +Post-processing the trace +Found matching signature in a thread +Syscall mix tool results: + syscall count : syscall_num +.* + syscall trace count : syscall_num +.* diff --git a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex index 0c3712398cd..4ef96d3d3e9 100644 --- a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex +++ b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex @@ -1,6 +1,6 @@ Hello, world! 
Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\) Misses: 0 Compulsory misses: *[0-9,\.]* Invalidations: 0 -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-filter.templatex b/clients/drcachesim/tests/offline-filter.templatex index 326b785b35a..7db8cbdfaa1 100644 --- a/clients/drcachesim/tests/offline-filter.templatex +++ b/clients/drcachesim/tests/offline-filter.templatex @@ -1,6 +1,6 @@ Hello, world! Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: *[0-9]?[0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-fork.templatex b/clients/drcachesim/tests/offline-fork.templatex index d4a5a4b2402..f8991518514 100644 --- a/clients/drcachesim/tests/offline-fork.templatex +++ b/clients/drcachesim/tests/offline-fork.templatex @@ -3,7 +3,7 @@ parent waiting for child child is running under DynamoRIO child has exited Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -16,9 +16,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 
\(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-instr-only-trace.templatex b/clients/drcachesim/tests/offline-instr-only-trace.templatex index 0c3712398cd..4ef96d3d3e9 100644 --- a/clients/drcachesim/tests/offline-instr-only-trace.templatex +++ b/clients/drcachesim/tests/offline-instr-only-trace.templatex @@ -1,6 +1,6 @@ Hello, world! Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\) Misses: 0 Compulsory misses: *[0-9,\.]* Invalidations: 0 -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/offline-multiproc.templatex b/clients/drcachesim/tests/offline-multiproc.templatex index 76b03014e8e..1555a744eaa 100644 --- a/clients/drcachesim/tests/offline-multiproc.templatex +++ b/clients/drcachesim/tests/offline-multiproc.templatex @@ -1,6 +1,6 @@ all done Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9\.,]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9\.,]* Invalidations: *0 .* Miss rate: *[0-9]*[,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9\.,]* Misses: *[0-9\.,]* diff --git a/clients/drcachesim/tests/offline-simple.templatex b/clients/drcachesim/tests/offline-simple.templatex index c35d941b14b..3f1e8897be4 100644 --- a/clients/drcachesim/tests/offline-simple.templatex +++ b/clients/drcachesim/tests/offline-simple.templatex @@ -1,6 +1,6 @@ Hello, world! 
Cache simulation results: -Core #0 \(1 thread\(s\)\) +Core #0 \(traced CPU\(s\): #0\) L1I0 .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* @@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\) Compulsory misses: *[0-9,\.]* Invalidations: *0 .* Miss rate: [0-9][,\.]..% -Core #1 \(0 thread\(s\)\) -Core #2 \(0 thread\(s\)\) -Core #3 \(0 thread\(s\)\) +Core #1 \(traced CPU\(s\): \) +Core #2 \(traced CPU\(s\): \) +Core #3 \(traced CPU\(s\): \) LL .* stats: Hits: *[0-9,\.]* Misses: *[0-9,\.]* diff --git a/clients/drcachesim/tests/record_filter_modify_marker_value.templatex b/clients/drcachesim/tests/record_filter_modify_marker_value.templatex new file mode 100644 index 00000000000..e64c4881cc2 --- /dev/null +++ b/clients/drcachesim/tests/record_filter_modify_marker_value.templatex @@ -0,0 +1,20 @@ +Estimation of pi is 3.142425985001098 + +Trace invariant checks passed + +Output .* entries from .* entries. + +Output format: + +<--record#-> <--instr#->: <---tid---> + +------------------------------------------------------------ + + 1 0: +[0-9]+ + 2 0: +[0-9]+ + 3 0: +[0-9]+ + 4 0: +[0-9]+ + 5 0: +[0-9]+ + 6 0: +[0-9]+ + 7 0: +[0-9]+ +.* diff --git a/clients/drcachesim/tests/record_filter_unit_tests.cpp b/clients/drcachesim/tests/record_filter_unit_tests.cpp index fd6d6ccf0cf..7bdab2bc5ad 100644 --- a/clients/drcachesim/tests/record_filter_unit_tests.cpp +++ b/clients/drcachesim/tests/record_filter_unit_tests.cpp @@ -44,6 +44,7 @@ #include "tools/filter/type_filter.h" #include "tools/filter/encodings2regdeps_filter.h" #include "tools/filter/func_id_filter.h" +#include "tools/filter/modify_marker_value_filter.h" #include "trace_entry.h" #include "zipfile_ostream.h" @@ -600,6 +601,80 @@ test_func_id_filter() return true; } +static bool +test_modify_marker_value_filter() +{ + constexpr addr_t PC = 0x7f6fdd3ec360; + constexpr addr_t ENCODING = 0xe78948; + constexpr uint64_t NEW_PAGE_SIZE_MARKER_VALUE = 0x800; // 2k pages. + std::vector entries = { + // Trace shard header. 
+ { { TRACE_TYPE_HEADER, 0, { 0x1 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_VERSION, { 0x2 } }, true, { true } }, + { { TRACE_TYPE_MARKER, + TRACE_MARKER_TYPE_FILETYPE, + { OFFLINE_FILE_TYPE_ARCH_X86_64 | OFFLINE_FILE_TYPE_ENCODINGS | + OFFLINE_FILE_TYPE_SYSCALL_NUMBERS | OFFLINE_FILE_TYPE_BLOCKING_SYSCALLS } }, + true, + { true } }, + { { TRACE_TYPE_THREAD, 0, { 0x4 } }, true, { true } }, + { { TRACE_TYPE_PID, 0, { 0x5 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CACHE_LINE_SIZE, { 0x6 } }, + true, + { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_PAGE_SIZE, { 0x1000 } }, // 4k pages. + true, + { false } }, + // Overwrite the value of TRACE_MARKER_TYPE_PAGE_SIZE with 0x800 == 2048 == 2k + // page size. + { { TRACE_TYPE_MARKER, + TRACE_MARKER_TYPE_PAGE_SIZE, + { NEW_PAGE_SIZE_MARKER_VALUE } }, + false, + { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_TIMESTAMP, { 0x7 } }, true, { true } }, + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CPU_ID, { 0x8 } }, true, { false } }, + // Overwrite the value of TRACE_MARKER_TYPE_CPU_ID with ((uintptr_t)-1). + { { TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CPU_ID, { INVALID_CPU_MARKER_VALUE } }, + false, + { true } }, + // We need at least one instruction with encodings to make record_filter output + // the trace. + { { TRACE_TYPE_ENCODING, 3, { ENCODING } }, true, { true } }, + { { TRACE_TYPE_INSTR, 3, { PC } }, true, { true } }, + + { { TRACE_TYPE_FOOTER, 0, { 0x0 } }, true, { true } }, + }; + + // Construct modify_marker_value_filter_t. We change TRACE_MARKER_TYPE_CPU_ID values + // with INVALID_CPU_MARKER_VALUE == ((uintptr_t)-1) and TRACE_MARKER_TYPE_PAGE_SIZE + // with 2k. 
+ std::vector modify_marker_value_pairs_list = { TRACE_MARKER_TYPE_CPU_ID, + INVALID_CPU_MARKER_VALUE, + TRACE_MARKER_TYPE_PAGE_SIZE, + NEW_PAGE_SIZE_MARKER_VALUE }; + std::vector> filters; + auto modify_marker_value_filter = std::unique_ptr( + new dynamorio::drmemtrace::modify_marker_value_filter_t( + modify_marker_value_pairs_list)); + if (!modify_marker_value_filter->get_error_string().empty()) { + fprintf(stderr, "Couldn't construct a modify_marker_value_filter %s", + modify_marker_value_filter->get_error_string().c_str()); + return false; + } + filters.push_back(std::move(modify_marker_value_filter)); + + // Construct record_filter_t. + test_record_filter_t record_filter(std::move(filters), 0, /*write_archive=*/true); + + // Run the test. + if (!process_entries_and_check_result(&record_filter, entries, 0)) + return false; + + fprintf(stderr, "test_modify_marker_value_filter passed\n"); + return true; +} + static bool test_cache_and_type_filter() { @@ -1450,7 +1525,7 @@ test_main(int argc, const char *argv[]) dr_standalone_init(); if (!test_cache_and_type_filter() || !test_chunk_update() || !test_trim_filter() || !test_null_filter() || !test_wait_filter() || !test_encodings2regdeps_filter() || - !test_func_id_filter()) + !test_func_id_filter() || !test_modify_marker_value_filter()) return 1; fprintf(stderr, "All done!\n"); dr_standalone_exit(); diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index d88977f2dbd..5c121c947cb 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1598,6 +1599,8 @@ test_synthetic_with_timestamps() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration_instrs = 3; + // Test dropping a final "_" from core0. 
+ sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1615,9 +1618,8 @@ test_synthetic_with_timestamps() // thus core0 has C,I,B,H,D and core1 has F,J,E,A,G. // We should interleave within each group -- except once we reach J // we should completely finish it. There should be no migrations. - assert( - sched_as_string[0] == - ".CC.C.II.IC.CC.I.II.CC.C.II.I..BB.B.HH.HB.BB.H.HH.BB.B.HH.H..DD.DD.DD.DD.D._"); + assert(sched_as_string[0] == + ".CC.C.II.IC.CC.I.II.CC.C.II.I..BB.B.HH.HB.BB.H.HH.BB.B.HH.H..DD.DD.DD.DD.D."); assert(sched_as_string[1] == ".FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..EE.EE.EE.EE.E..AA.A.GG.GA.AA.G.GG.AA.A.GG.G."); // Check scheduler stats. # switches is the # of letter transitions; # preempts @@ -1625,7 +1627,7 @@ test_synthetic_with_timestamps() // appearing in between (and ignoring the last letter for an input: EOF doesn't // count as a preempt). verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/12, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/2, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/9, @@ -1699,6 +1701,8 @@ test_synthetic_with_priorities() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration_instrs = 3; + // Test dropping a final "_" from core0. + sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1711,9 +1715,8 @@ test_synthetic_with_priorities() // See the test_synthetic_with_timestamps() test which has our base sequence. // We've elevated B, E, and H to higher priorities so they go // first. J remains uninterrupted due to lower timestamps. 
- assert( - sched_as_string[0] == - ".BB.B.HH.HB.BB.H.HH.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..DD.DD.DD.DD.D._"); + assert(sched_as_string[0] == + ".BB.B.HH.HB.BB.H.HH.BB.B.HH.H..FF.F.JJ.JJ.JJ.JJ.J.F.FF.FF.F..DD.DD.DD.DD.D."); assert(sched_as_string[1] == ".EE.EE.EE.EE.E..CC.C.II.IC.CC.I.II.CC.C.II.I..AA.A.GG.GA.AA.G.GG.AA.A.GG.G."); // Check scheduler stats. # switches is the # of letter transitions; # preempts @@ -1721,7 +1724,7 @@ test_synthetic_with_priorities() // appearing in between (and ignoring the last letter for an input: EOF doesn't // count as a preempt). verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/9, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/5, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/12, @@ -2032,6 +2035,8 @@ test_synthetic_with_syscalls_multiple() sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; sched_ops.block_time_multiplier = BLOCK_SCALE; + // Test dropping a bunch of final "_" from core1. + sched_ops.exit_if_fraction_inputs_left = 0.1; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2054,9 +2059,8 @@ test_synthetic_with_syscalls_multiple() // explains why the two strings are different lengths. assert(sched_as_string[0] == "BHHHFFFJJJJJJBHHHJJJFFFFFFBHHHDDDDDDDDDB__________B__________B__________B____" - "______B_______B"); - assert(sched_as_string[1] == - "EECCCIIICCCIIIEECCCIIIAAAGGGEEAAAGGEEGAAEGGAG_________"); + "______B__________B"); + assert(sched_as_string[1] == "EECCCIIICCCIIIEECCCIIIAAAGGGEEAAAGGEEGAAEGGAG"); // Check scheduler stats. 
# switches is the # of letter transitions; # preempts // is the instances where the same letter appears 3 times without another letter // appearing in between (and ignoring the last letter for an input: EOF doesn't @@ -2066,7 +2070,7 @@ test_synthetic_with_syscalls_multiple() /*switch_nop=*/4, /*preempts=*/10, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/19, - /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0, + /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0, /*switch_nop=*/3, /*preempts=*/16, /*direct_attempts=*/0, /*direct_successes=*/0, /*migrations=*/0); } @@ -2242,7 +2246,7 @@ test_synthetic_with_syscalls_precise() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/4); + /*verbosity=*/3); sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) != @@ -2346,7 +2350,7 @@ test_synthetic_with_syscalls_latencies() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/4); + /*verbosity=*/3); // We use a mock time for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.time_units_per_us = 1.; @@ -3291,7 +3295,8 @@ test_replay_skip() case 1: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 1); + // The value should be replaced by the shard id == 0. 
+ assert(memref.marker.marker_value == 0); break; case 2: assert(type_is_instr(memref.instr.type)); @@ -3305,12 +3310,14 @@ test_replay_skip() case 4: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP); - assert(memref.marker.marker_value == 40); + // The value should be replaced by a synthetic value: the initial (10) + // won't have advanced to the next microsecond. + assert(memref.marker.marker_value == 10); break; case 5: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 4); + assert(memref.marker.marker_value == 0); break; case 6: assert(type_is_instr(memref.instr.type)); @@ -3319,12 +3326,12 @@ test_replay_skip() case 7: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP); - assert(memref.marker.marker_value == 50); + assert(memref.marker.marker_value == 10); break; case 8: assert(memref.marker.type == TRACE_TYPE_MARKER); assert(memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID); - assert(memref.marker.marker_value == 5); + assert(memref.marker.marker_value == 0); break; case 9: assert(type_is_instr(memref.instr.type)); @@ -6001,6 +6008,295 @@ test_rebalancing() } } +static void +test_initial_migrate() +{ + std::cerr << "\n----------------\nTesting initial migrations\n"; + // We want to ensure migration thresholds are applied to never-executed inputs. + static constexpr int NUM_OUTPUTS = 2; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr memref_tid_t TID_A = TID_BASE + 0; + static constexpr memref_tid_t TID_B = TID_BASE + 1; + static constexpr memref_tid_t TID_C = TID_BASE + 2; + static constexpr uint64_t TIMESTAMP_START = 10; + + // We have 3 inputs and 2 outputs. We expect a round-robin initial assignment + to put A and C on output #0 and B on #1. 
+ // B will finish #1 and then try to steal C from A but should fail if initial + // migrations have to wait for the threshold as though the input just ran + // right before the trace started, which is how we treat them now. + std::vector refs_A = { + /* clang-format off */ + make_thread(TID_A), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(10), + make_instr(11), + make_instr(12), + make_instr(13), + make_instr(14), + make_instr(15), + make_exit(TID_A), + /* clang-format on */ + }; + std::vector refs_B = { + /* clang-format off */ + make_thread(TID_B), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(20), + make_exit(TID_B), + /* clang-format on */ + }; + std::vector refs_C = { + /* clang-format off */ + make_thread(TID_C), + make_pid(1), + make_version(4), + make_timestamp(TIMESTAMP_START + 10), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(30), + make_instr(31), + make_instr(32), + make_exit(TID_C), + /* clang-format on */ + }; + + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + // We 
should see zero migrations since output #1 failed to steal C from output #0. + static const char *const CORE0_SCHED_STRING = "...AAAAAA....CCC."; + static const char *const CORE1_SCHED_STRING = "...B.____________"; + for (int i = 0; i < NUM_OUTPUTS; i++) { + assert(scheduler.get_stream(i)->get_schedule_statistic( + memtrace_stream_t::SCHED_STAT_MIGRATIONS) == 0); + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); +} + +static void +test_exit_early() +{ + std::cerr << "\n----------------\nTesting exiting early\n"; + static constexpr int NUM_INPUTS = 12; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t TIMESTAMP = 101; + static constexpr uint64_t BLOCK_LATENCY = 1500; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(TIMESTAMP)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // One input has a long blocking syscall toward the end. + if (i == 0 && j == NUM_INSTRS - 2) { + inputs[i].push_back(make_timestamp(TIMESTAMP)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + inputs[i].push_back(make_timestamp(TIMESTAMP + BLOCK_LATENCY)); + } + } + inputs[i].push_back(make_exit(tid)); + } + { + // Run without any early exit. 
+ std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + // We use our mock's time==instruction count for a deterministic result. + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.exit_if_fraction_inputs_left = 0.; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // We have a long idle wait just to execute A's final instruction. + static const char *const CORE0_SCHED_STRING = + "..AAA..CCC..EEE..GGG..III..KKKAAACCCEEEGGGIIIKKKAA....CCC.EEE.GGG.III.KKK.__" + "_________________________________________________________________A."; + static const char *const CORE1_SCHED_STRING = + "..BBB..DDD..FFF..HHH..JJJ..LLLBBBDDDFFFHHHJJJLLLBBB.DDD.FFF.HHH.JJJ.LLL.____" + "___________________________________________________________________"; + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } + { + // Run with any early exit. 
+ std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + // We use our mock's time==instruction count for a deterministic result. + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + // NUM_INPUTS=12 * 0.1 = 1.2 so we'll exit with 1 input left. + sched_ops.exit_if_fraction_inputs_left = 0.1; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + // Now we exit after K and never execute the 9th A. + static const char *const CORE0_SCHED_STRING = + "..AAA..CCC..EEE..GGG..III..KKKAAACCCEEEGGGIIIKKKAA....CCC.EEE.GGG.III.KKK."; + static const char *const CORE1_SCHED_STRING = + "..BBB..DDD..FFF..HHH..JJJ..LLLBBBDDDFFFHHHJJJLLLBBB.DDD.FFF.HHH.JJJ.LLL.__"; + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + +static void +test_marker_updates() +{ + std::cerr << "\n----------------\nTesting marker updates\n"; + static constexpr int NUM_INPUTS = 5; + static constexpr int NUM_OUTPUTS = 3; + // We need at least enough instrs to cover INSTRS_PER_US==2000. 
+ static constexpr int NUM_INSTRS = 10000; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t TIMESTAMP_BASE = 12340000; + + std::vector inputs[NUM_INPUTS]; + + std::minstd_rand rand_gen; + rand_gen.seed(static_cast(reinterpret_cast(&inputs[0]))); + + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + // Add a randomly-increasing-value timestamp. + uint64_t cur_timestamp = TIMESTAMP_BASE; + cur_timestamp += rand_gen(); + inputs[i].push_back(make_timestamp(cur_timestamp)); + // Add a cpuid with a random value. + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, rand_gen())); + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Add a randomly-increasing-value timestamp. + cur_timestamp += rand_gen(); + inputs[i].push_back(make_timestamp(cur_timestamp)); + // Add a cpuid with a random value. 
+ inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, rand_gen())); + } + inputs[i].push_back(make_exit(tid)); + } + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), + TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/2); + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector outputs(NUM_OUTPUTS, nullptr); + std::vector last_timestamp(NUM_OUTPUTS, 0); + std::vector eof(NUM_OUTPUTS, false); + for (int i = 0; i < NUM_OUTPUTS; i++) + outputs[i] = scheduler.get_stream(i); + int num_eof = 0; + while (num_eof < NUM_OUTPUTS) { + for (int i = 0; i < NUM_OUTPUTS; i++) { + if (eof[i]) + continue; + memref_t memref; + scheduler_t::stream_status_t status = outputs[i]->next_record(memref); + if (status == scheduler_t::STATUS_EOF) { + ++num_eof; + eof[i] = true; + continue; + } + if (status == scheduler_t::STATUS_IDLE) + continue; + assert(status == scheduler_t::STATUS_OK); + if (memref.marker.type != TRACE_TYPE_MARKER) + continue; + // Make sure the random values have some order now, satisfying invariants. + if (memref.marker.marker_type == TRACE_MARKER_TYPE_TIMESTAMP) { + assert(memref.marker.marker_value >= last_timestamp[i]); + last_timestamp[i] = memref.marker.marker_value; + } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_CPU_ID) { + assert(memref.marker.marker_value == + static_cast(outputs[i]->get_shard_index())); + } + } + } + // Ensure we didn't short-circuit or exit early. 
+ uint64_t instrs_seen = 0; + for (int i = 0; i < NUM_OUTPUTS; i++) + instrs_seen += outputs[i]->get_instruction_ordinal(); + assert(instrs_seen == NUM_INPUTS * NUM_INSTRS); +} + int test_main(int argc, const char *argv[]) { @@ -6042,6 +6338,9 @@ test_main(int argc, const char *argv[]) test_random_schedule(); test_record_scheduler(); test_rebalancing(); + test_initial_migrate(); + test_exit_early(); + test_marker_updates(); dr_standalone_exit(); return 0; diff --git a/clients/drcachesim/tools/filter/modify_marker_value_filter.h b/clients/drcachesim/tools/filter/modify_marker_value_filter.h new file mode 100644 index 00000000000..7833ba76826 --- /dev/null +++ b/clients/drcachesim/tools/filter/modify_marker_value_filter.h @@ -0,0 +1,111 @@ +/* ********************************************************** + * Copyright (c) 2024 Google, Inc. All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. 
OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#ifndef _MODIFY_MARKER_VALUE_FILTER_H_ +#define _MODIFY_MARKER_VALUE_FILTER_H_ 1 + +#include "record_filter.h" +#include "trace_entry.h" + +#include +#include + +namespace dynamorio { +namespace drmemtrace { + +/* This filter takes a list of pairs and modifies the value + * of all listed markers in the trace with the given new_value. + */ +class modify_marker_value_filter_t : public record_filter_t::record_filter_func_t { +public: + modify_marker_value_filter_t(std::vector modify_marker_value_pairs_list) + { + size_t list_size = modify_marker_value_pairs_list.size(); + if (list_size == 0) { + error_string_ = "List of pairs is empty."; + } else if (list_size % 2 != 0) { + error_string_ = "List of pairs is missing " + "part of a pair as its size is not even"; + } else { + for (size_t i = 0; i < list_size; i += 2) { + trace_marker_type_t marker_type = + static_cast(modify_marker_value_pairs_list[i]); + uint64_t new_value = modify_marker_value_pairs_list[i + 1]; + // We ignore duplicate pairs and use the last pair in the list. + marker_to_value_map_[marker_type] = new_value; + } + } + } + + void * + parallel_shard_init(memtrace_stream_t *shard_stream, + bool partial_trace_filter) override + { + return nullptr; + } + + bool + parallel_shard_filter( + trace_entry_t &entry, void *shard_data, + record_filter_t::record_filter_info_t &record_filter_info) override + { + trace_type_t entry_type = static_cast(entry.type); + // Output any trace_entry_t that's not a marker. 
+ if (entry_type != TRACE_TYPE_MARKER) + return true; + + // Check if the TRACE_TYPE_MARKER_ is in the list of markers for which we want to + // overwrite their value. If not, output the marker unchanged. + trace_marker_type_t marker_type = static_cast(entry.size); + const auto &it = marker_to_value_map_.find(marker_type); + if (it == marker_to_value_map_.end()) + return true; + + // Overwrite marker value. + entry.addr = static_cast(it->second); + + return true; + } + + bool + parallel_shard_exit(void *shard_data) override + { + return true; + } + +private: + std::unordered_map marker_to_value_map_; +}; + +} // namespace drmemtrace +} // namespace dynamorio +#endif /* _MODIFY_MARKER_VALUE_FILTER_H_ */ diff --git a/clients/drcachesim/tools/filter/record_filter.cpp b/clients/drcachesim/tools/filter/record_filter.cpp index 9d038583b61..4a52c8b0ef8 100644 --- a/clients/drcachesim/tools/filter/record_filter.cpp +++ b/clients/drcachesim/tools/filter/record_filter.cpp @@ -62,6 +62,7 @@ #include "type_filter.h" #include "encodings2regdeps_filter.h" #include "func_id_filter.h" +#include "modify_marker_value_filter.h" #undef VPRINT #ifdef DEBUG @@ -95,7 +96,9 @@ parse_string(const std::string &s, char sep = ',') std::vector vec; do { pos = s.find(sep, at); - unsigned long long parsed_number = std::stoull(s.substr(at, pos)); + // base = 0 allows to handle both decimal and hex numbers. + unsigned long long parsed_number = + std::stoull(s.substr(at, pos), nullptr, /*base = */ 0); // XXX: parsed_number may be truncated if T is not large enough. 
// We could check that parsed_number is within the limits of T using // std::numeric_limits<>::min()/max(), but this returns 0 on T that are enums, @@ -119,7 +122,7 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp const std::string &remove_marker_types, uint64_t trim_before_timestamp, uint64_t trim_after_timestamp, bool encodings2regdeps, const std::string &keep_func_ids, - unsigned int verbose) + const std::string &modify_marker_value, unsigned int verbose) { std::vector< std::unique_ptr> @@ -160,6 +163,14 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp std::unique_ptr( new dynamorio::drmemtrace::func_id_filter_t(keep_func_ids_list))); } + if (!modify_marker_value.empty()) { + std::vector modify_marker_value_pairs_list = + parse_string(modify_marker_value); + filter_funcs.emplace_back( + std::unique_ptr( + new dynamorio::drmemtrace::modify_marker_value_filter_t( + modify_marker_value_pairs_list))); + } // TODO i#5675: Add other filters. diff --git a/clients/drcachesim/tools/filter/record_filter_create.h b/clients/drcachesim/tools/filter/record_filter_create.h index 0516d7bb713..d48513a4255 100644 --- a/clients/drcachesim/tools/filter/record_filter_create.h +++ b/clients/drcachesim/tools/filter/record_filter_create.h @@ -67,6 +67,9 @@ namespace drmemtrace { * @param[in] keep_func_ids A comma-separated list of integers representing the * function IDs related to #TRACE_MARKER_TYPE_FUNC_ID (and _ARG, _RETVAL, _RETADDR) * markers to preserve in the trace, while removing all other function markers. + * @param[in] modify_marker_value A list of comma-separated pairs of integers representing + * to modify the value of all listed TRACE_MARKER_TYPE_ + * in the trace with their corresponding new_value. * @param[in] verbose Verbosity level for notifications. 
*/ record_analysis_tool_t * @@ -75,7 +78,7 @@ record_filter_tool_create(const std::string &output_dir, uint64_t stop_timestamp const std::string &remove_marker_types, uint64_t trim_before_timestamp, uint64_t trim_after_timestamp, bool encodings2regdeps, const std::string &keep_func_ids, - unsigned int verbose); + const std::string &modify_marker_value, unsigned int verbose); } // namespace drmemtrace } // namespace dynamorio diff --git a/clients/drcachesim/tools/record_filter_launcher.cpp b/clients/drcachesim/tools/record_filter_launcher.cpp index e1a8bb7a481..bcc9f2c1098 100644 --- a/clients/drcachesim/tools/record_filter_launcher.cpp +++ b/clients/drcachesim/tools/record_filter_launcher.cpp @@ -138,6 +138,16 @@ droption_t "TRACE_MARKER_TYPE_FUNC_[ID | ARG | RETVAL | RETADDR] " "markers for the listed function IDs and removed those " "belonging to unlisted function IDs."); + +droption_t op_modify_marker_value( + DROPTION_SCOPE_FRONTEND, "filter_modify_marker_value", "", + "Comma-separated pairs of integers representing .", + "This option is for -tool record_filter. It modifies the value of all listed " + "TRACE_MARKER_TYPE_ markers in the trace with their corresponding new_value. " + "The list must have an even size. 
Example: -filter_modify_marker_value 3,24,18,2048 " + "sets all TRACE_MARKER_TYPE_CPU_ID == 3 in the trace to core 24 and " + "TRACE_MARKER_TYPE_PAGE_SIZE == 18 to 2k."); + } // namespace int @@ -168,7 +178,8 @@ _tmain(int argc, const TCHAR *targv[]) op_cache_filter_size.get_value(), op_remove_trace_types.get_value(), op_remove_marker_types.get_value(), op_trim_before_timestamp.get_value(), op_trim_after_timestamp.get_value(), op_encodings2regdeps.get_value(), - op_filter_func_ids.get_value(), op_verbose.get_value())); + op_filter_func_ids.get_value(), op_modify_marker_value.get_value(), + op_verbose.get_value())); std::vector tools; tools.push_back(record_filter.get()); diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index db3e217b5d8..61e175a4c5e 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -65,6 +65,11 @@ class schedule_stats_t : public analysis_tool_t { print_results() override; bool parallel_shard_supported() override; + shard_type_t + preferred_shard_type() override + { + return SHARD_BY_CORE; + } void * parallel_shard_init_stream(int shard_index, void *worker_data, memtrace_stream_t *stream) override; diff --git a/clients/drcachesim/tools/view.cpp b/clients/drcachesim/tools/view.cpp index 7817169c89d..702667db73c 100644 --- a/clients/drcachesim/tools/view.cpp +++ b/clients/drcachesim/tools/view.cpp @@ -335,8 +335,13 @@ view_t::parallel_shard_memref(void *shard_data, const memref_t &memref) // see a cpuid marker on a thread switch. To avoid that assumption // we would want to track the prior tid and print out a thread switch // message whenever it changes. 
- std::cerr << "\n"; + if (memref.marker.marker_value == INVALID_CPU_MARKER_VALUE) { + std::cerr << "\n"; + } else { + std::cerr << "\n"; + } break; case TRACE_MARKER_TYPE_KERNEL_EVENT: if (trace_version_ <= TRACE_ENTRY_VERSION_NO_KERNEL_PC) { diff --git a/clients/drcachesim/tracer/syscall_pt_trace.cpp b/clients/drcachesim/tracer/syscall_pt_trace.cpp index 24ebceb1f6a..5700e4abd91 100644 --- a/clients/drcachesim/tracer/syscall_pt_trace.cpp +++ b/clients/drcachesim/tracer/syscall_pt_trace.cpp @@ -157,7 +157,7 @@ syscall_pt_trace_t::start_syscall_pt_trace(DR_PARAM_IN int sysnum) } bool -syscall_pt_trace_t::stop_syscall_pt_trace() +syscall_pt_trace_t::stop_syscall_pt_trace(bool dump_to_trace) { ASSERT(is_initialized_, "syscall_pt_trace_t is not initialized"); ASSERT(drcontext_ != nullptr, "drcontext_ is nullptr"); @@ -171,11 +171,13 @@ syscall_pt_trace_t::stop_syscall_pt_trace() return false; } - if (!trace_data_dump(pttracer_output_buffer_)) { - return false; - } + if (dump_to_trace) { + if (!trace_data_dump(pttracer_output_buffer_)) { + return false; + } - traced_syscall_idx_++; + ++traced_syscall_idx_; + } cur_recording_sysnum_ = -1; /* Reset the pttracer handle for next syscall. diff --git a/clients/drcachesim/tracer/syscall_pt_trace.h b/clients/drcachesim/tracer/syscall_pt_trace.h index d9af890d227..108474b8864 100644 --- a/clients/drcachesim/tracer/syscall_pt_trace.h +++ b/clients/drcachesim/tracer/syscall_pt_trace.h @@ -122,9 +122,11 @@ class syscall_pt_trace_t { bool start_syscall_pt_trace(DR_PARAM_IN int sysnum); - /* Stop the PT tracing for current syscall and dump the output data to one file. */ + /* Stop the PT tracing for current syscall and dump the output data to the trace + * if dump_to_trace is set. + */ bool - stop_syscall_pt_trace(); + stop_syscall_pt_trace(bool dump_to_trace); /* Get the sysnum of current recording syscall. 
*/ int diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 6d8f6cab7d1..88db853deb3 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -1499,27 +1499,28 @@ exit_record_syscall() #ifdef BUILD_PT_TRACER static bool -stop_cur_syscall_pt_trace(void *drcontext, per_thread_t *data) +stop_cur_syscall_pt_trace(void *drcontext, per_thread_t *data, bool dump_to_trace) { int cur_recording_sysnum = data->syscall_pt_trace.get_cur_recording_sysnum(); ASSERT(cur_recording_sysnum != INVALID_SYSNUM, "Routine expected to be called only when PT tracing is active."); - /* Write a marker to the userspace raw trace that denotes where raw2trace - * should decode and insert the PT trace for the system call being - * recorded currently. Some drmemtrace derivations may interleave the PT - * trace raw data with the drmemtrace user-space raw trace data (instead of - * outputting the PT trace data to separate files like we do here). In such - * cases, we want to ensure that the TRACE_MARKER_TYPE_SYSCALL_IDX does not - * get output before the actual PT trace data, so we output the marker when - * we stop and write the PT trace (instead of when we start the PT trace). - * Note that the order below does not matter because the actual buffer - * flush happens later. - */ - trace_marker_type_t marker_type = TRACE_MARKER_TYPE_SYSCALL_IDX; - uintptr_t marker_val = data->syscall_pt_trace.get_traced_syscall_idx(); - BUF_PTR(data->seg_base) += - instru->append_marker(BUF_PTR(data->seg_base), marker_type, marker_val); - if (!data->syscall_pt_trace.stop_syscall_pt_trace()) { + if (dump_to_trace) { + // Write a marker to the userspace raw trace that denotes where raw2trace + // should decode and insert the PT trace for the system call being + // recorded currently. 
Some drmemtrace derivations may interleave the PT + // trace raw data with the drmemtrace user-space raw trace data (instead of + // outputting the PT trace data to separate files like we do here). In such + // cases, we want to ensure that the TRACE_MARKER_TYPE_SYSCALL_IDX does not + // get output before the actual PT trace data, so we output the marker when + // we stop and write the PT trace (instead of when we start the PT trace). + // Note that the order below does not matter because the actual buffer + // flush happens later. + trace_marker_type_t marker_type = TRACE_MARKER_TYPE_SYSCALL_IDX; + uintptr_t marker_val = data->syscall_pt_trace.get_traced_syscall_idx(); + BUF_PTR(data->seg_base) += + instru->append_marker(BUF_PTR(data->seg_base), marker_type, marker_val); + } + if (!data->syscall_pt_trace.stop_syscall_pt_trace(dump_to_trace)) { NOTIFY(0, "ERROR: Failed to stop PT tracing for syscall %d of thread " "T%d.\n", @@ -1611,7 +1612,13 @@ event_pre_syscall(void *drcontext, int sysnum) "Last syscall tracing wasn't stopped when we reached the next one"); // In the release build, in case we somehow did not stop the PT tracing, we // try to stop it and continue. - if (!stop_cur_syscall_pt_trace(drcontext, data)) + // XXX: Something didn't go as expected as the last syscall PT trace was + // not stopped yet. We may need to find other control points where PT + // tracing needs to be stopped. E.g., PT tracing for syscalls interrupted by + // signals may need to be stopped in main_signal_handler. Though it has + // not been observed yet, the traces dumped below may have issues during + // decoding. + if (!stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/true)) return false; } @@ -1688,7 +1695,7 @@ event_post_syscall(void *drcontext, int sysnum) ASSERT(syscall_pt_trace_t::is_syscall_pt_trace_enabled(cur_recording_sysnum), "Did not expect syscall tracing to be enabled for this syscall"); // Ignore return value and try to continue in release build. 
- stop_cur_syscall_pt_trace(drcontext, data); + stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/true); } else { // No syscall trace is being recorded. This may be because syscall tracing // is not enabled for sysnum, or that we were not in tracing mode at the @@ -1861,11 +1868,17 @@ event_thread_exit(void *drcontext) int cur_recording_sysnum = data->syscall_pt_trace.get_cur_recording_sysnum(); if (cur_recording_sysnum != INVALID_SYSNUM) { NOTIFY(0, - "ERROR: PT tracing for the last syscall %d of thread T%d was " + "ERROR: PT tracing for the last syscall %d of thread T%d was " "found active at detach.\n", cur_recording_sysnum, dr_get_thread_id(drcontext)); // Ignore return value and try to continue in release build. - stop_cur_syscall_pt_trace(drcontext, data); + // We skip dumping the trace because the syscall was likely interrupted + // by the detach signal and does not represent the real app behavior. + // XXX: Can we somehow figure out how much of the PT trace we can keep? + // Such PT syscall traces at the thread's end have been seen to not + // decode successfully in libipt, particularly for syscalls like futex, + // and epoll_wait. 
+ stop_cur_syscall_pt_trace(drcontext, data, /*dump_to_trace=*/false); } } #endif diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 8d739a44568..b23ebf77a66 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -390,8 +390,8 @@ if (UNIX) set(OS_SRCS ${OS_SRCS} unix/loader_android.c) else () set(OS_SRCS ${OS_SRCS} unix/loader_linux.c) + set(OS_SRCS ${OS_SRCS} unix/coredump.c) endif () - set(OS_SRCS ${OS_SRCS} unix/coredump.c) set(OS_SRCS ${OS_SRCS} unix/memquery_linux.c) set(OS_SRCS ${OS_SRCS} unix/memquery.c) set(OS_SRCS ${OS_SRCS} unix/memcache.c) diff --git a/core/heap.c b/core/heap.c index 162264f5483..168084a8c72 100644 --- a/core/heap.c +++ b/core/heap.c @@ -786,6 +786,8 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro { ptr_uint_t preferred = 0; #ifdef X64 + LOG(GLOBAL, LOG_HEAP, 1, "%s: vm heap allowed range " PFX "-" PFX "\n", __FUNCTION__, + heap_allowable_region_start, heap_allowable_region_end); /* -heap_in_lower_4GB takes top priority and has already set heap_allowable_region_*. * Next comes -vm_base_near_app. It will fail for -vm_size=2G, which we document. */ @@ -838,20 +840,24 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro /* Next we try the -vm_base value plus a random offset. */ if (vmh->start_addr == NULL) { - /* Out of 32 bits = 12 bits are page offset, windows wastes 4 more - * since its allocation base is 64KB, and if we want to stay - * safely in say 0x20000000-0x2fffffff we're left with only 12 - * bits of randomness - which may be too little. On the other - * hand changing any of the lower 16 bits will make our bugs - * non-deterministic. 
*/ - /* Make sure we don't waste the lower bits from our random number */ - preferred = (DYNAMO_OPTION(vm_base) + - get_random_offset(DYNAMO_OPTION(vm_max_offset) / - DYNAMO_OPTION(vmm_block_size)) * - DYNAMO_OPTION(vmm_block_size)); - preferred = ALIGN_FORWARD(preferred, OS_ALLOC_GRANULARITY); - /* overflow check: w/ vm_base shouldn't happen so debug-only check */ - ASSERT(!POINTER_OVERFLOW_ON_ADD(preferred, *size)); + if (DYNAMO_OPTION(vm_base) == 0) { + /* Let the OS pick where. */ + } else { + /* Out of 32 bits = 12 bits are page offset, windows wastes 4 more + * since its allocation base is 64KB, and if we want to stay + * safely in say 0x20000000-0x2fffffff we're left with only 12 + * bits of randomness - which may be too little. On the other + * hand changing any of the lower 16 bits will make our bugs + * non-deterministic. */ + /* Make sure we don't waste the lower bits from our random number */ + preferred = (DYNAMO_OPTION(vm_base) + + get_random_offset(DYNAMO_OPTION(vm_max_offset) / + DYNAMO_OPTION(vmm_block_size)) * + DYNAMO_OPTION(vmm_block_size)); + preferred = ALIGN_FORWARD(preferred, OS_ALLOC_GRANULARITY); + /* overflow check: w/ vm_base shouldn't happen so debug-only check */ + ASSERT(!POINTER_OVERFLOW_ON_ADD(preferred, *size)); + } /* let's assume a single chunk is sufficient to reserve */ #ifdef X64 if ((byte *)preferred < heap_allowable_region_start || @@ -866,8 +872,9 @@ vmm_place_vmcode(vm_heap_t *vmh, /*INOUT*/ size_t *size, heap_error_code_t *erro os_heap_reserve((void *)preferred, *size, error_code, true /*+x*/); vmh->start_addr = vmh->alloc_start; LOG(GLOBAL, LOG_HEAP, 1, - "vmm_heap_unit_init preferred=" PFX " got start_addr=" PFX "\n", - preferred, vmh->start_addr); + "vmm_heap_unit_init preferred=" PFX " size=" PIFX " => start_addr=" PFX + " (code=0x%08x)\n", + preferred, *size, vmh->start_addr, *error_code); #ifdef X64 } #endif diff --git a/core/lib/instrument.c b/core/lib/instrument.c index 56a3a2b0941..3b3590b42d3 100644 --- 
a/core/lib/instrument.c +++ b/core/lib/instrument.c @@ -2499,7 +2499,7 @@ dr_create_memory_dump(dr_memory_dump_spec_t *spec) return os_dump_core_live(spec->label, spec->ldmp_path, spec->ldmp_path_size); #elif defined(LINUX) && defined(X64) if (TEST(DR_MEMORY_DUMP_ELF, spec->flags)) - return os_dump_core_live(); + return os_dump_core_live(); #endif return false; } diff --git a/core/optionsx.h b/core/optionsx.h index a35a3934502..e95d757cbdf 100644 --- a/core/optionsx.h +++ b/core/optionsx.h @@ -1,5 +1,5 @@ /* ******************************************************************************* - * Copyright (c) 2010-2022 Google, Inc. All rights reserved. + * Copyright (c) 2010-2024 Google, Inc. All rights reserved. * Copyright (c) 2011 Massachusetts Institute of Technology All rights reserved. * Copyright (c) 2003-2010 VMware, Inc. All rights reserved. * *******************************************************************************/ @@ -1627,8 +1627,11 @@ OPTION_DEFAULT(uint_size, vmheap_size_wow64, 128 * 1024 * 1024, "capacity of virtual memory region reserved for unreachable heap " "on WoW64 processes") #endif -/* We hardcode an address in the mmap_text region here, but verify via - * in vmk_init(). +/* We hardcode a default vmcode base address, to which we add a random offset + * up to vm_max_offset. + * If -vm_base is set to 0, and -vm_base_near_app is off, we let the OS pick + * the base and do not apply any offset. 
+ * For the default: * For Linux we start higher to avoid limiting the brk (i#766), but with our * new default -vm_size of 0x20000000 we want to stay below our various * preferred addresses of 0x7xxx0000 so we keep the base plus offset plus diff --git a/core/unix/coredump.c b/core/unix/coredump.c index eec54a1f526..3c26d45691b 100644 --- a/core/unix/coredump.c +++ b/core/unix/coredump.c @@ -37,19 +37,36 @@ #define SECTION_HEADER_TABLE ".shstrtab" #define VVAR_SECTION "[vvar]" +#ifdef X64 +# define ELF_HEADER_TYPE Elf64_Ehdr +# define ELF_PROGRAM_HEADER_TYPE Elf64_Phdr +# define ELF_SECTION_HEADER_TYPE Elf64_Shdr +# define ELF_ADDR Elf64_Addr +# define ELF_WORD Elf64_Xword +# define ELF_OFF Elf64_Word +#else +# define ELF_HEADER_TYPE Elf32_Ehdr +# define ELF_PROGRAM_HEADER_TYPE Elf32_Phdr +# define ELF_SECTION_HEADER_TYPE Elf32_Shdr +# define ELF_ADDR Elf32_Addr +# define ELF_WORD Elf32_Word +# define ELF_OFF Elf32_Word +#endif + DECLARE_CXTSWPROT_VAR(static mutex_t dump_core_lock, INIT_LOCK_FREE(dump_core_lock)); /* - * Returns true if the ELF header is written to the core dump file, false - * otherwise. + * Writes an ELF header to the file. Returns true if the ELF header is written to the core + * dump file, false otherwise. 
*/ static bool -write_elf_header(file_t elf_file, Elf64_Addr entry_point, - Elf64_Off section_header_table_offset, Elf64_Half flags, - Elf64_Half program_header_count, Elf64_Half section_header_count, - Elf64_Half section_string_table_index) +write_elf_header(DR_PARAM_IN file_t elf_file, DR_PARAM_IN ELF_ADDR entry_point, + DR_PARAM_IN ELF_OFF section_header_table_offset, + DR_PARAM_IN ELF_OFF flags, DR_PARAM_IN ELF_OFF program_header_count, + DR_PARAM_IN ELF_OFF section_header_count, + DR_PARAM_IN ELF_OFF section_string_table_index) { - Elf64_Ehdr ehdr; + ELF_HEADER_TYPE ehdr; ehdr.e_ident[0] = ELFMAG0; ehdr.e_ident[1] = ELFMAG1; ehdr.e_ident[2] = ELFMAG2; @@ -74,29 +91,32 @@ write_elf_header(file_t elf_file, Elf64_Addr entry_point, ehdr.e_ehsize = sizeof(ehdr); /* Contains the size of a program header table entry. As explained below, this will * typically be 0x20 (32 bit) or 0x38 (64 bit). */ - ehdr.e_phentsize = sizeof(Elf64_Phdr); + ehdr.e_phentsize = sizeof(ELF_PROGRAM_HEADER_TYPE); /* Contains the number of entries in the program header table. */ ehdr.e_phnum = program_header_count; - ehdr.e_shentsize = sizeof(Elf64_Shdr); + ehdr.e_shentsize = sizeof(ELF_SECTION_HEADER_TYPE); /* Contains the number of entries in the section header table. */ ehdr.e_shnum = section_header_count; /* Contains index of the section header table entry that contains the section names. */ ehdr.e_shstrndx = section_string_table_index; - return os_write(elf_file, (void *)&ehdr, sizeof(Elf64_Ehdr)) == sizeof(Elf64_Ehdr); + return os_write(elf_file, (void *)&ehdr, sizeof(ELF_HEADER_TYPE)) == + sizeof(ELF_HEADER_TYPE); } /* - * Returns true if the program header is written to the core dump file, false - * otherwise. + * Writes a program header to the file. Returns true if the program header is written to + * the core dump file, false otherwise. 
*/ static bool -write_phdrs(file_t elf_file, Elf64_Word type, Elf64_Word flags, Elf64_Off offset, - Elf64_Addr virtual_address, Elf64_Addr physical_address, - Elf64_Xword file_size, Elf64_Xword memory_size, Elf64_Xword alignment) +write_phdrs(DR_PARAM_IN file_t elf_file, DR_PARAM_IN ELF_WORD type, + DR_PARAM_IN ELF_WORD flags, DR_PARAM_IN ELF_OFF offset, + DR_PARAM_IN ELF_ADDR virtual_address, DR_PARAM_IN ELF_ADDR physical_address, + DR_PARAM_IN ELF_WORD file_size, DR_PARAM_IN ELF_WORD memory_size, + DR_PARAM_IN ELF_WORD alignment) { - Elf64_Phdr phdr; + ELF_PROGRAM_HEADER_TYPE phdr; phdr.p_type = type; /* Segment type */ phdr.p_flags = flags; /* Segment flags */ phdr.p_offset = offset; /* Segment file offset */ @@ -106,20 +126,23 @@ write_phdrs(file_t elf_file, Elf64_Word type, Elf64_Word flags, Elf64_Off offset phdr.p_memsz = memory_size; /* Segment size in memory */ phdr.p_align = alignment; /* Segment alignment */ - return os_write(elf_file, (void *)&phdr, sizeof(Elf64_Phdr)) == sizeof(Elf64_Phdr); + return os_write(elf_file, (void *)&phdr, sizeof(ELF_PROGRAM_HEADER_TYPE)) == + sizeof(ELF_PROGRAM_HEADER_TYPE); } /* - * Returns true if the section header is written to the core dump file, false - * otherwise. + * Write a section header to the file. Returns true if the section header is written to + * the core dump file, false otherwise. 
*/ static bool -write_shdr(file_t elf_file, Elf64_Word string_table_offset, Elf64_Word type, - Elf64_Xword flags, Elf64_Addr virtual_address, Elf64_Off offset, - Elf64_Xword section_size, Elf64_Word link, Elf64_Word info, - Elf64_Xword alignment, Elf64_Xword entry_size) +write_shdr(DR_PARAM_IN file_t elf_file, DR_PARAM_IN ELF_WORD string_table_offset, + DR_PARAM_IN ELF_WORD type, DR_PARAM_IN ELF_WORD flags, + DR_PARAM_IN ELF_ADDR virtual_address, DR_PARAM_IN ELF_OFF offset, + DR_PARAM_IN ELF_WORD section_size, DR_PARAM_IN ELF_WORD link, + DR_PARAM_IN ELF_WORD info, DR_PARAM_IN ELF_WORD alignment, + DR_PARAM_IN ELF_WORD entry_size) { - Elf64_Shdr shdr; + ELF_SECTION_HEADER_TYPE shdr; shdr.sh_name = string_table_offset; /* Section name (string tbl index) */ shdr.sh_type = type; /* Section type */ shdr.sh_flags = flags; /* Section flags */ @@ -131,11 +154,13 @@ write_shdr(file_t elf_file, Elf64_Word string_table_offset, Elf64_Word type, shdr.sh_addralign = alignment; /* Section alignment */ shdr.sh_entsize = entry_size; /* Entry size if section holds table */ - return os_write(elf_file, (void *)&shdr, sizeof(Elf64_Shdr)) == sizeof(Elf64_Shdr); + return os_write(elf_file, (void *)&shdr, sizeof(ELF_SECTION_HEADER_TYPE)) == + sizeof(ELF_SECTION_HEADER_TYPE); } /* - * Returns true if the section header is written to the core dump file, false - * otherwise. + * Writes a section header to the file. Returns true if the section header is written to + * the core dump file, false otherwise. 
*/ static bool os_dump_core_internal(void) @@ -148,9 +173,9 @@ os_dump_core_internal(void) char string_table[MAX_SECTION_NAME_BUFFER_SIZE]; string_table[0] = '\0'; string_table[1] = '\0'; - int64 string_table_offset = 1; - int64 section_count = 0; - int64 seciion_data_size = 0; + ELF_ADDR string_table_offset = 1; + ELF_OFF section_count = 0; + ELF_OFF seciion_data_size = 0; memquery_iter_t iter; if (memquery_iterator_start(&iter, NULL, /*may_alloc=*/false)) { while (memquery_iterator_next(&iter)) { @@ -158,10 +183,10 @@ os_dump_core_internal(void) if (iter.prot == MEMPROT_NONE || strcmp(iter.comment, VVAR_SECTION) == 0) { continue; } - int64 offset = 0; + ELF_ADDR offset = 0; if (iter.comment != NULL && iter.comment[0] != '\0') { - offset = (int64)strhash_hash_lookup(GLOBAL_DCONTEXT, string_htable, - iter.comment); + offset = (ELF_ADDR)strhash_hash_lookup(GLOBAL_DCONTEXT, string_htable, + iter.comment); if (offset == 0 && (string_table[1] == '\0' || d_r_strcmp(string_table, iter.comment) != 0)) { @@ -205,8 +230,9 @@ os_dump_core_internal(void) } if (!write_elf_header(elf_file, /*entry_point=*/0, - /*section_header_table_offset*/ sizeof(Elf64_Ehdr) + - 1 /*program_header_count*/ * sizeof(Elf64_Phdr) + + /*section_header_table_offset*/ sizeof(ELF_HEADER_TYPE) + + 1 /*program_header_count*/ * + sizeof(ELF_PROGRAM_HEADER_TYPE) + seciion_data_size, /*flags=*/0, /*program_header_count=*/1, @@ -215,7 +241,7 @@ os_dump_core_internal(void) os_close(elf_file); return false; } - // TODO i#xxxx: Fill the program header with valid data. + // TODO i#7046: Fill the program header with valid data. if (!write_phdrs(elf_file, PT_NULL, PF_X, /*offset=*/0, /*virtual_address=*/0, /*physical_address=*/0, /*file_size=*/0, /*memory_size=*/0, /*alignment=*/0)) { @@ -249,27 +275,27 @@ os_dump_core_internal(void) } if (memquery_iterator_start(&iter, NULL, /*may_alloc=*/false)) { - // TODO i#xxxx: Handle multiple program headers. 
- int64 file_offset = - sizeof(Elf64_Ehdr) + 1 /*program_header_count*/ * sizeof(Elf64_Phdr); + // TODO i#7046: Handle multiple program headers. + ELF_OFF file_offset = sizeof(ELF_HEADER_TYPE) + + 1 /*program_header_count*/ * sizeof(ELF_PROGRAM_HEADER_TYPE); while (memquery_iterator_next(&iter)) { // Skip non-readable section. if (iter.prot == MEMPROT_NONE || strcmp(iter.comment, VVAR_SECTION) == 0) { continue; } - Elf64_Xword flags = SHF_ALLOC | SHF_MERGE; + ELF_WORD flags = SHF_ALLOC | SHF_MERGE; if (iter.prot & PROT_WRITE) { flags |= SHF_WRITE; } - int64 name_offset = 0; + ELF_ADDR name_offset = 0; if (iter.comment != NULL && iter.comment[0] != '\0') { - name_offset = (int64)strhash_hash_lookup(GLOBAL_DCONTEXT, string_htable, - iter.comment); + name_offset = (ELF_ADDR)strhash_hash_lookup(GLOBAL_DCONTEXT, + string_htable, iter.comment); } if (!write_shdr(elf_file, name_offset, SHT_PROGBITS, flags, - (Elf64_Addr)iter.vm_start, file_offset, + (ELF_ADDR)iter.vm_start, file_offset, iter.vm_end - iter.vm_start, /*link=*/0, - /*info=*/0, /*alignment=*/sizeof(Elf64_Xword), + /*info=*/0, /*alignment=*/sizeof(ELF_WORD), /*entry_size=*/0)) { os_close(elf_file); return false; diff --git a/core/unix/os_exports.h b/core/unix/os_exports.h index 26f1b6cd8e5..d66c67111b1 100644 --- a/core/unix/os_exports.h +++ b/core/unix/os_exports.h @@ -246,7 +246,8 @@ ushort os_get_app_tls_reg_offset(reg_id_t seg); void * os_get_app_tls_base(dcontext_t *dcontext, reg_id_t seg); -bool os_dump_core_live(void); +bool +os_dump_core_live(void); #if defined(AARCHXX) || defined(RISCV64) bool diff --git a/core/utils.c b/core/utils.c index 50846eb6ae9..3217387cdf9 100644 --- a/core/utils.c +++ b/core/utils.c @@ -417,7 +417,8 @@ locks_not_closed() cur_lock->rank == LOCK_RANK(options_lock) /* This lock can be used parallel to detach cleanup. 
*/ - IF_UNIX(|| cur_lock->rank == LOCK_RANK(detached_sigact_lock)))) { + IF_UNIX(|| cur_lock->rank == LOCK_RANK(detached_sigact_lock)) + IF_LINUX(|| cur_lock->rank == LOCK_RANK(dump_core_lock)))) { /* i#1058: curiosities during exit re-acquire these locks. */ ignored++; } else { diff --git a/core/utils.h b/core/utils.h index f78f5790a0f..8d8517a628a 100644 --- a/core/utils.h +++ b/core/utils.h @@ -618,7 +618,7 @@ enum { # elif defined(MACOS) LOCK_RANK(memquery_backing_lock), # endif -# ifdef WINDOWS +# if defined(WINDOWS) || defined(LINUX) LOCK_RANK(dump_core_lock), # endif diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index 2a3ad603403..195196f1bc7 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -1235,7 +1235,8 @@ reserve_pred_register(instr_t *sg_instr, spill_slot_state_t *slot_state) break; } } - DR_ASSERT(slot_state->pred_slots[slot].kind == SLOT_KIND_UNUSED); + DR_ASSERT(slot < NUM_PRED_SLOTS && + slot_state->pred_slots[slot].kind == SLOT_KIND_UNUSED); /* Some instructions require the predicate to be in the range p0 - p7. This includes * LASTB which we use to extract elements from the vector register. @@ -1261,7 +1262,8 @@ reserve_vector_register(instr_t *sg_instr, spill_slot_state_t *slot_state) break; } } - DR_ASSERT(slot_state->vector_slots[slot].kind == SLOT_KIND_UNUSED); + DR_ASSERT(slot < NUM_VECTOR_SLOTS && + slot_state->vector_slots[slot].kind == SLOT_KIND_UNUSED); reg_id_t min_reg = DR_REG_Z0; /* Skip over any registers that have already been allocated. 
*/ @@ -1314,8 +1316,9 @@ unreserve_pred_register(void *drcontext, instrlist_t *bb, instr_t *where, break; } } - DR_ASSERT(slot_state->pred_slots[slot].kind == SLOT_KIND_SPILL); - DR_ASSERT(slot_state->pred_slots[slot].reg == scratch_pred); + DR_ASSERT(slot < NUM_PRED_SLOTS && + slot_state->pred_slots[slot].kind == SLOT_KIND_SPILL); + DR_ASSERT(slot < NUM_PRED_SLOTS && slot_state->pred_slots[slot].reg == scratch_pred); unreserve_sve_register(drcontext, bb, where, scratch_gpr0, scratch_pred, offsetof(per_thread_t, scratch_pred_spill_slots), @@ -1337,7 +1340,8 @@ unreserve_vector_register(void *drcontext, instrlist_t *bb, instr_t *where, break; } } - DR_ASSERT(slot_state->vector_slots[slot].reg == scratch_vec); + DR_ASSERT(slot < NUM_VECTOR_SLOTS && + slot_state->vector_slots[slot].reg == scratch_vec); unreserve_sve_register(drcontext, bb, where, scratch_gpr0, scratch_vec, offsetof(per_thread_t, scratch_vector_spill_slots_aligned), diff --git a/make/package.cmake b/make/package.cmake index 67ae451db0e..d6585d8c343 100644 --- a/make/package.cmake +++ b/make/package.cmake @@ -1,5 +1,5 @@ # ********************************************************** -# Copyright (c) 2011-2022 Google, Inc. All rights reserved. +# Copyright (c) 2011-2024 Google, Inc. All rights reserved. # Copyright (c) 2009-2010 VMware, Inc. All rights reserved. # ********************************************************** @@ -164,10 +164,15 @@ set(base_cache " BUILD_NUMBER:STRING=${arg_build} UNIQUE_BUILD_NUMBER:STRING=${arg_ubuild} BUILD_PACKAGE:BOOL=ON - AUTOMATED_TESTING:BOOL=ON ${arg_cacheappend} ") +if (WIN32) + # TODO i#5767: Install a working zlib package on our Windows GA CI images. 
+ set(base_cache "${base_cache} + DISABLE_ZLIB:BOOL=ON") +endif() + # version is optional if (arg_version) set(base_cache "${base_cache} diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt index 577fe842833..5bb52650edc 100644 --- a/suite/tests/CMakeLists.txt +++ b/suite/tests/CMakeLists.txt @@ -1403,7 +1403,11 @@ function(torun test key source native standalone_dr dr_ops exe_ops added_out pas endif () if (DEFINED ${key}_sudo) - set(rundr "sudo;${rundr}") + if (DEFINED ${key}_nodr) + set(exepath "sudo;${exepath}") + else () + set(rundr "sudo;${rundr}") + endif () endif () if (is_runall) @@ -2127,7 +2131,8 @@ if (AARCH64 AND UNIX AND ZLIB_FOUND) prefix_cmd_if_necessary(drcachesim_path ON ${drcachesim_path}) torunonly_api(tool.drcacheoff.tlb_simulator_v2p "${drcachesim_path}" "offline-tlb_simulator_v2p" "" - "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical" + # Do not use core-sharded scheduling, so we'll have a deterministic result. + "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical;-no_core_sharded" OFF OFF) set(tool.drcacheoff.tlb_simulator_v2p_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") @@ -3972,12 +3977,14 @@ if (BUILD_CLIENTS) set(tool.core_on_disk_rawtemp ON) # no preprocessor torunonly_simtool(core_on_disk_schedule ${ci_shared_app} - "-indir ${core_sharded_dir} -tool schedule_stats" "") + # Avoid the default core-sharded from re-scheduling the trace. + "-indir ${core_sharded_dir} -tool schedule_stats -no_core_sharded" "") set(tool.core_on_disk_schedule_rawtemp ON) # no preprocessor # Test -only_shards on core-sharded-on-disk traces. torunonly_simtool(only_shards ${ci_shared_app} - "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3" "") + # Avoid the default core-sharded from re-scheduling the trace. 
+ "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3 -no_core_sharded" "") set(tool.core_on_disk_rawtemp ON) # no preprocessor endif () endif () @@ -4096,10 +4103,10 @@ if (BUILD_CLIENTS) torunonly_drcacheoff(fork linux.fork "" "" "") endif () - # Test reading a legacy pre-interleaved file. + # Test reading a legacy pre-interleaved file in thread-sharded mode. if (ZLIB_FOUND) torunonly_api(tool.drcacheoff.legacy "${drcachesim_path}" "offline-legacy.c" "" - "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz" + "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz;-no_core_sharded" OFF OFF) set(tool.drcacheoff.legacy_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") @@ -4135,16 +4142,20 @@ if (BUILD_CLIENTS) # Test reading a trace in sharded snappy-compressed files. if (libsnappy) # with a parallel tool (basic_counts) - torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy.c" "" + torunonly_api(tool.drcacheoff.snappy_parallel "${drcachesim_path}" "offline-snappy.c" "" "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-tool;basic_counts" OFF OFF) + set(tool.drcacheoff.snappy_parallel_basedir + "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") + set(tool.drcacheoff.snappy_parallel_expectbase "offline-snappy") - # with a legacy serial tool (full simulator) - torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy-serial.c" "" - "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir" + # With a legacy serial tool (full simulator) in thread-sharded mode. 
+ torunonly_api(tool.drcacheoff.snappy_serial "${drcachesim_path}" "offline-snappy-serial.c" "" + "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-no_core_sharded" OFF OFF) - set(tool.drcacheoff.snappy_basedir + set(tool.drcacheoff.snappy_serial_basedir "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests") + set(tool.drcacheoff.snappy_serial_expectbase "offline-snappy-serial") endif() if (UNIX) # Enable on Windows once i#1727 is fixed. @@ -4437,15 +4448,16 @@ if (BUILD_CLIENTS) # Run with -trace_after_instrs to ensure we test the # drbbdup + rseq combo (i#5658, i#5659). "-trace_after_instrs 5K" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + # Run thread-sharded for the invariant checker. + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") # Test filtering. torunonly_drcacheoff(rseq-filter linux.rseq "-trace_after_instrs 5K -L0_filter" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") set(tool.drcacheoff.rseq-filter_expectbase "offline-rseq") torunonly_drcacheoff(rseq-dfilter linux.rseq "-trace_after_instrs 5K -L0D_filter" - "@${test_mode_flag}@-test_mode_name@rseq_app" "") + "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "") set(tool.drcacheoff.rseq-dfilter_expectbase "offline-rseq") endif () @@ -4742,9 +4754,9 @@ if (BUILD_CLIENTS) # Run the record filter tool with a null filter. set(${testname}_postcmd2 "${CMAKE_COMMAND}@-E@make_directory@${outdir}") set(${testname}_postcmd3 ${launch_cmd}) - # Run the analyzer on the result. + # Run the analyzer on the result. Avoid double-core-sharding. 
set(${testname}_postcmd4 - "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}") + "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}@-no_core_sharded") endmacro () set(testname "tool.record_filter") @@ -4790,6 +4802,18 @@ if (BUILD_CLIENTS) # We run basic_counts on the filtered trace to check that there are no function # related markers left. "basic_counts") + + # Run modify_marker_value_filter to overwrite the values of + # TRACE_MARKER_TYPE_CPU_ID markers with (uintptr_t)-1, which represents an unknown + # CPU, and TRACE_MARKER_TYPE_PAGE_SIZE to 2k. + set(testname "tool.record_filter_modify_marker_value") + torun_record_filter("${testname}" ${kernel_xfer_app} + "record_filter_modify_marker_value" + "${drcachesim_path}@-simulator_type@record_filter@-filter_modify_marker_value@3,0xffffffffffffffff,18,2048@-indir@${testname}.${kernel_xfer_app}.*.dir/trace@-outdir@${testname}.filtered.dir" + # We run view on the filtered trace to check that the value of the first + # TRACE_MARKER_TYPE_CPU_ID is (uintptr_t)-1 and the value of + # TRACE_MARKER_TYPE_PAGE_SIZE is 2k. 
+ "view") endif () if (X86 AND X64 AND UNIX AND NOT APPLE) @@ -4868,7 +4892,7 @@ if (BUILD_CLIENTS) file(MAKE_DIRECTORY ${outdir}) torunonly_api(tool.record_filter_as_traced "${drcachesim_path}" "record_filter_as_traced" - "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF) + "" "-tool;schedule_stats;-no_core_sharded;-indir;${outdir}" OFF OFF) set(tool.record_filter_as_traced_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake") set(tool.record_filter_as_traced_precmd "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@7@-indir@${trace_dir}@-outdir@${outdir}") @@ -4885,7 +4909,7 @@ if (BUILD_CLIENTS) file(MAKE_DIRECTORY ${outdir}) torunonly_api(tool.record_filter_start_idle "${drcachesim_path}" "record_filter_start_idle" - "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF) + "" "-tool;schedule_stats;-indir;${outdir};-no_core_sharded" OFF OFF) set(tool.record_filter_start_idle_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake") set(tool.record_filter_start_idle_precmd "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@4@-indir@${trace_dir}@-outdir@${outdir}") @@ -5040,6 +5064,13 @@ if (BUILD_CLIENTS) # kernel tracing code is not being intentionally tested. torunonly_drcachesim(kernel-skip-kcore_SUDO ${ci_shared_app} "-offline -enable_kernel_tracing -skip_kcore_dump" "") + + if (LINUX) + set(tool.drcacheoff.burst_syscall_pt_SUDO_nodr ON) + set(tool.drcacheoff.burst_syscall_pt_SUDO_sudo ON) + torunonly_drcacheoff(burst_syscall_pt_SUDO tool.drcacheoff.burst_syscall_pt_SUDO + "" "@-tool@syscall_mix" "") + endif () endif (BUILD_PT_TRACER AND BUILD_PT_POST_PROCESSOR) endif (proc_supports_pt)