diff --git a/clients/drcachesim/scheduler/scheduler_dynamic.cpp b/clients/drcachesim/scheduler/scheduler_dynamic.cpp
index 8f8dad07eb..979c2c6691 100644
--- a/clients/drcachesim/scheduler/scheduler_dynamic.cpp
+++ b/clients/drcachesim/scheduler/scheduler_dynamic.cpp
@@ -458,7 +458,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
         // boundaries so we live with those being before the switch.
         // XXX: Once we insert kernel traces, we may have to try harder
         // to stop before the post-syscall records.
-        if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
+        if (this->record_type_is_instr_boundary(record,
+                                                outputs_[output].last_record.record) &&
             // We want to delay the context switch until after the injected syscall trace.
             !outputs_[output].in_syscall_code) {
             if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) {
@@ -507,7 +508,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
         this->process_marker(*input, output, marker_type, marker_value);
     }
     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
-        this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
+        this->record_type_is_instr_boundary(record,
+                                            outputs_[output].last_record.record) &&
         !outputs_[output].in_context_switch_code) {
         ++input->instrs_in_quantum;
         if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
@@ -546,7 +548,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
             // We only switch on instruction boundaries.  We could possibly switch
             // in between (e.g., scatter/gather long sequence of reads/writes) by
             // setting input->switching_pre_instruction.
-            this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
+            this->record_type_is_instr_boundary(record,
+                                                outputs_[output].last_record.record)) {
             if (outputs_[output].in_syscall_code) {
                 // XXX: Maybe this should be printed only once per-syscall-instance to
                 // reduce log spam.
diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index eeda7fbe1b..a077f2325c 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -95,6 +95,9 @@ typedef dynamorio::drmemtrace::record_file_reader_t<std::ifstream>
     default_record_file_reader_t;
 #endif
 
+static constexpr bool IS_REAL = true;
+static constexpr bool IS_SYNTHETIC = false;
+
 std::string
 replay_file_checker_t::check(archive_istream_t *infile)
 {
@@ -564,8 +567,8 @@ scheduler_impl_tmpl_t<trace_entry_t, record_reader_t>::insert_switch_tid_pid(
     tid.size = 0;
     tid.addr = static_cast<addr_t>(input.tid);
 
-    input.queue.push_front(pid);
-    input.queue.push_front(tid);
+    input.queue.emplace_front(pid, IS_SYNTHETIC);
+    input.queue.emplace_front(tid, IS_SYNTHETIC);
 }
 
 /***************************************************************************
@@ -656,7 +659,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS]);
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue rebalances",
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_RUNQUEUE_REBALANCES]);
-        VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Ouput limits hit",
+        VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Output limits hit",
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_HIT_OUTPUT_LIMIT]);
 #ifndef NDEBUG
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue lock acquired",
@@ -664,6 +667,10 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue lock contended",
                outputs_[i].ready_queue.lock->get_count_contended());
 #endif
+        VPRINT(
+            this, 1, "  %-35s: %9" PRId64 "\n", "Kernel switch sequence injections",
+            outputs_[i]
+                .stats[memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS]);
     }
 }
 
@@ -850,7 +857,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::init(
                 ? spec_type_t::USE_NOPS
                 // TODO i#5843: Add more flags for other options.
                 : spec_type_t::LAST_FROM_TRACE,
-            static_cast<int>(get_time_micros()), create_invalid_record(), verbosity_);
+            static_cast<int>(get_time_micros()),
+            cached_record_t(create_invalid_record(), IS_SYNTHETIC), verbosity_);
         if (options_.single_lockstep_output)
             outputs_.back().stream = global_stream_.get();
         if (options_.schedule_record_ostream != nullptr) {
@@ -1523,8 +1531,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             // Maybe we should disallow it so we don't need checks like this?
             options_.mapping != sched_type_t::MAP_AS_PREVIOUSLY) {
             RecordType record = create_invalid_record();
+            bool is_record_real = false;
             stream_status_t res =
-                advance_region_of_interest(/*output=*/-1, record, input);
+                advance_region_of_interest(/*output=*/-1, record, input, is_record_real);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 input.next_timestamp =
                     static_cast<uintptr_t>(input.reader->get_last_timestamp());
@@ -1548,7 +1557,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             // the non-consuming queue loop vs the consuming and queue-pushback
             // reader loop.
             for (const auto &record : input.queue) {
-                if (!process_next_initial_record(input, record, found_filetype,
+                if (!process_next_initial_record(input, record.record, found_filetype,
                                                  found_timestamp))
                     break;
             }
@@ -1592,7 +1601,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
                 // we skip (see skip_instructions()).  Thus, we abort with an error.
                 if (record_type_is_instr(record))
                     break;
-                input.queue.push_back(record);
+                input.queue.emplace_back(record, IS_REAL);
+                ++input.real_records_in_queue;
                 ++(*input.reader);
             }
         }
@@ -1632,7 +1642,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::open_reader(
         RecordType record = **reader;
         if (record_type_has_tid(record, tid))
             break;
-        input.queue.push_back(record);
+        input.queue.emplace_back(record, IS_REAL);
+        ++input.real_records_in_queue;
         ++(*reader);
     }
     if (tid == INVALID_THREAD_ID) {
@@ -1821,10 +1832,14 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_record_ordinal(
         return 0;
     uint64_t ord = inputs_[index].reader->get_record_ordinal();
     if (get_instr_ordinal(inputs_[index]) == 0) {
+        uint64_t adjust =
+            inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real ? 1 : 0;
+        adjust += inputs_[index].real_records_in_queue;
+        assert(ord >= adjust);
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         // If this gets any more complex, the scheduler stream should track its
         // own counts for every input and just ignore the input stream's tracking.
-        ord -= inputs_[index].queue.size() + (inputs_[index].cur_from_queue ? 1 : 0);
+        ord -= adjust;
     }
     return ord;
 }
@@ -1852,7 +1867,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_first_timestamp(
         return 0;
     uint64_t res = inputs_[index].reader->get_first_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
-        (!inputs_[index].queue.empty() || inputs_[index].cur_from_queue)) {
+        (inputs_[index].real_records_in_queue > 0 ||
+         (inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1871,7 +1887,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
         return 0;
     uint64_t res = inputs_[index].reader->get_last_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
-        (!inputs_[index].queue.empty() || inputs_[index].cur_from_queue)) {
+        (inputs_[index].real_records_in_queue > 0 ||
+         (inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1881,7 +1898,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
-    output_ordinal_t output, RecordType &record, input_info_t &input)
+    output_ordinal_t output, RecordType &record, input_info_t &input,
+    bool &is_record_real)
 {
     assert(input.lock->owned_by_cur_thread());
     uint64_t cur_instr = get_instr_ordinal(input);
@@ -1913,7 +1931,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
                     if (status != sched_type_t::STATUS_OK)
                         return status;
                 }
-                input.queue.push_back(create_thread_exit(input.tid));
+                input.queue.push_back({ create_thread_exit(input.tid), IS_SYNTHETIC });
                 stream_status_t status = mark_input_eof(input);
                 // For early EOF we still need our synthetic exit so do not return it yet.
                 if (status != sched_type_t::STATUS_OK &&
@@ -1932,8 +1950,13 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
         if (input.cur_region > 0) {
             VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                    input.index);
-            input.queue.push_back(record);
+            // Defer the current record behind the separator.  It may have come from
+            // the reader, so keep real_records_in_queue in sync with the queue.
+            input.queue.emplace_back(record, is_record_real);
+            if (is_record_real)
+                ++input.real_records_in_queue;
             record = create_region_separator_marker(input.tid, input.cur_region);
+            is_record_real = false;
         }
         return sched_type_t::STATUS_OK;
     }
@@ -2011,11 +2030,12 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::clear_input_queue(input_info_t &i
     int i = 0;
     while (!input.queue.empty()) {
         assert(i == 0 ||
-               (!record_type_is_instr(input.queue.front()) &&
-                !record_type_is_encoding(input.queue.front())));
+               (!record_type_is_instr(input.queue.front().record) &&
+                !record_type_is_encoding(input.queue.front().record)));
         ++i;
         input.queue.pop_front();
     }
+    input.real_records_in_queue = 0;
 }
 
 template <typename RecordType, typename ReaderType>
@@ -2032,8 +2052,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &i
     // For a skip of 0 we still need to clear non-instrs from the queue, but
     // should not have an instr in there.
     assert(skip_amount > 0 || input.queue.empty() ||
-           (!record_type_is_instr(input.queue.front()) &&
-            !record_type_is_encoding(input.queue.front())));
+           (!record_type_is_instr(input.queue.front().record) &&
+            !record_type_is_encoding(input.queue.front().record)));
     clear_input_queue(input);
     input.reader->skip_instructions(skip_amount);
     VPRINT(this, 3, "skip_instructions: input=%d amount=%" PRIu64 "\n", input.index,
@@ -2072,7 +2092,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &i
         VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                input.index);
         input.queue.push_back(
-            create_region_separator_marker(input.tid, input.cur_region));
+            { create_region_separator_marker(input.tid, input.cur_region),
+              IS_SYNTHETIC });
     }
     return sched_type_t::STATUS_SKIPPED;
 }
@@ -2446,7 +2467,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::on_context_switch(
          --i) {
         RecordType record = switch_sequence_[switch_type][i];
         record_type_set_tid(record, inputs_[new_input].tid);
-        inputs_[new_input].queue.emplace_front(record);
+        inputs_[new_input].queue.emplace_front(record, IS_SYNTHETIC);
     }
     VPRINT(this, 3, "Inserted %zu switch for type %d from %d.%d to %d.%d\n",
            switch_sequence_[switch_type].size(), switch_type,
@@ -2536,6 +2557,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     }
     while (true) {
         input->cur_from_queue = false;
+        input->is_cur_record_real = false;
         if (input->needs_init) {
             // We pay the cost of this conditional to support ipc_reader_t::init() which
             // blocks and must be called right before reading its first record.
@@ -2547,9 +2569,12 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             input->needs_init = false;
         }
         if (!input->queue.empty()) {
-            record = input->queue.front();
-            input->queue.pop_front();
+            record = input->queue.front().record;
             input->cur_from_queue = true;
+            input->is_cur_record_real = input->queue.front().is_real;
+            input->queue.pop_front();
+            if (input->is_cur_record_real)
+                --input->real_records_in_queue;
         } else {
             // We again have a flag check because reader_t::init() does an initial ++
             // and so we want to skip that on the first record but perform a ++ prior
@@ -2582,12 +2607,17 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 continue;
             } else {
                 record = **input->reader;
+                input->is_cur_record_real = true;
             }
         }
         VPRINT(this, 5,
                "next_record[%d]: candidate record from %d (@%" PRId64 "): ", output,
                input->index, get_instr_ordinal(*input));
-        if (input->instrs_pre_read > 0 && record_type_is_instr(record))
+        // FIXME: This decrement is likely premature; we should either move it
+        // later, or undo it in cases where we do not end up returning the
+        // pre-read instruction to the caller.
+        if (input->instrs_pre_read > 0 && input->is_cur_record_real &&
+            record_type_is_instr(record))
             --input->instrs_pre_read;
         VDO(this, 5, print_record(record););
         bool need_new_input = false;
@@ -2605,7 +2635,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             // We have to put the candidate record in the queue before we release
             // the lock since another output may grab this input.
             VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output);
-            input->queue.push_back(record);
+            input->queue.emplace_back(record, input->is_cur_record_real);
+            if (input->is_cur_record_real)
+                ++input->real_records_in_queue;
             lock.unlock();
             res = pick_next_input(output, blocked_time);
             if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT &&
@@ -2629,8 +2661,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 // we've already reset to 0.
                 if (!preempt && options_.mapping == sched_type_t::MAP_TO_ANY_OUTPUT) {
                     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
-                        record_type_is_instr_boundary(record,
-                                                      outputs_[output].last_record)) {
+                        record_type_is_instr_boundary(
+                            record, outputs_[output].last_record.record)) {
                         assert(inputs_[prev_input].instrs_in_quantum > 0);
                         --inputs_[prev_input].instrs_in_quantum;
                     } else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) {
@@ -2649,7 +2681,10 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 lock.lock();
                 if (res != sched_type_t::STATUS_SKIPPED) {
                     // Get our candidate record back.
-                    record = input->queue.back();
+                    record = input->queue.back().record;
+                    input->is_cur_record_real = input->queue.back().is_real;
+                    if (input->is_cur_record_real)
+                        --input->real_records_in_queue;
                     input->queue.pop_back();
                 }
             }
@@ -2662,7 +2697,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
         if (input->needs_roi && options_.mapping != sched_type_t::MAP_AS_PREVIOUSLY &&
             !input->regions_of_interest.empty()) {
             input_ordinal_t prev_input = input->index;
-            res = advance_region_of_interest(output, record, *input);
+            res = advance_region_of_interest(output, record, *input,
+                                             input->is_cur_record_real);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 // We need either the queue or to re-de-ref the reader so we loop,
                 // but we do not want to come back here.
@@ -2689,7 +2725,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     update_next_record(output, record);
     VDO(this, 4, print_record(record););
 
-    outputs_[output].last_record = record;
+    outputs_[output].last_record = { record, input->is_cur_record_real };
     record_type_has_tid(record, input->last_record_tid);
     record_type_has_pid(record, input->pid);
     return sched_type_t::STATUS_OK;
@@ -2727,6 +2763,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::update_next_record(output_ordinal
             VPRINT(this, 2, "output %d base timestamp = %zu\n", output,
                    outputs_[output].base_timestamp);
         }
+        // FIXME: When USE_INPUT_ORDINALS is enabled, this returns the input-local
+        // instruction ordinal (which not only is not global, but also counts the
+        // read-ahead instructions).
         uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal();
         uint64_t idle_count = outputs_[output].idle_count;
         uintptr_t new_time = static_cast<uintptr_t>(
@@ -2756,11 +2795,11 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal
                                                                   input_info_t *&input)
 {
     auto &outinfo = outputs_[output];
-    if (record_type_is_invalid(outinfo.last_record))
+    if (record_type_is_invalid(outinfo.last_record.record))
         return sched_type_t::STATUS_INVALID;
     if (!outinfo.speculation_stack.empty())
         return sched_type_t::STATUS_INVALID;
-    record = outinfo.last_record;
+    record = outinfo.last_record.record;
     input = &inputs_[outinfo.cur_input];
     std::lock_guard<mutex_dbg_owned> lock(*input->lock);
     VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
@@ -2772,7 +2811,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal
     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
         record_type_is_instr(record))
         --input->instrs_in_quantum;
-    outinfo.last_record = create_invalid_record();
+    outinfo.last_record = { create_invalid_record(), IS_SYNTHETIC };
     return sched_type_t::STATUS_OK;
 }
 
@@ -2784,7 +2823,11 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::start_speculation(
     auto &outinfo = outputs_[output];
     if (outinfo.speculation_stack.empty()) {
         if (queue_current_record) {
-            if (record_type_is_invalid(outinfo.last_record))
+            if (record_type_is_invalid(outinfo.last_record.record))
                 return sched_type_t::STATUS_INVALID;
             inputs_[outinfo.cur_input].queue.push_back(outinfo.last_record);
+            // The re-queued record keeps its real/synthetic tag, so the count of
+            // real queued records must follow it (it is decremented again on pop).
+            if (outinfo.last_record.is_real)
+                ++inputs_[outinfo.cur_input].real_records_in_queue;
         }
diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h
index 9fa0d0fffb..6e886f456f 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.h
+++ b/clients/drcachesim/scheduler/scheduler_impl.h
@@ -159,6 +159,15 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
 protected:
     typedef speculator_tmpl_t<RecordType> spec_type_t;
 
+    struct cached_record_t { // A record tagged as real or synthetic.
+        cached_record_t(RecordType rec, bool real)
+            : record(rec)
+            , is_real(real)
+        {
+        }
+        RecordType record;
+        bool is_real = false; // False for records synthesized by the scheduler.
+    };
     struct input_info_t {
         input_info_t()
             : lock(new mutex_dbg_owned)
@@ -195,8 +204,11 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // If non-empty these records should be returned before incrementing the reader.
         // This is used for read-ahead and inserting synthetic records.
         // We use a deque so we can iterate over it.
-        std::deque<RecordType> queue;
+        // Remember to adjust real_records_in_queue when adding or removing from this.
+        std::deque<cached_record_t> queue;
+        uint64_t real_records_in_queue = 0;
         bool cur_from_queue;
+        bool is_cur_record_real = false; // True iff read from the input, not synthesized.
         std::set<output_ordinal_t> binding;
         int priority = 0;
         std::vector<range_t> regions_of_interest;
@@ -423,7 +435,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         output_info_t(scheduler_impl_tmpl_t<RecordType, ReaderType> *scheduler_impl,
                       output_ordinal_t ordinal,
                       typename spec_type_t::speculator_flags_t speculator_flags,
-                      int rand_seed, RecordType last_record_init, int verbosity = 0)
+                      int rand_seed, cached_record_t last_record_init, int verbosity = 0)
             : self_stream(scheduler_impl, ordinal, verbosity)
             , stream(&self_stream)
             , ready_queue(rand_seed)
@@ -467,7 +479,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // while this field holds the instruction's start PC.  The use case is for
         // queueing a read-ahead instruction record for start_speculation().
         addr_t prev_speculate_pc = 0;
-        RecordType last_record; // Set to TRACE_TYPE_INVALID in constructor.
+        cached_record_t last_record; // Set to TRACE_TYPE_INVALID in constructor.
         // A list of schedule segments. During replay, this is read by other threads,
         // but it is only written at init time.
         std::vector<schedule_record_t> record;
@@ -687,7 +699,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
     // If STATUS_SKIPPED or STATUS_STOLE is returned, a new next record needs to be read.
     stream_status_t
     advance_region_of_interest(output_ordinal_t output, RecordType &record,
-                               input_info_t &input);
+                               input_info_t &input, bool &is_record_real);
 
     // Discards the contents of the input queue.  Meant to be used when skipping
     // input records.
diff --git a/clients/drcachesim/scheduler/scheduler_replay.cpp b/clients/drcachesim/scheduler/scheduler_replay.cpp
index a79f037cfe..86bcf1994c 100644
--- a/clients/drcachesim/scheduler/scheduler_replay.cpp
+++ b/clients/drcachesim/scheduler/scheduler_replay.cpp
@@ -425,7 +425,8 @@ scheduler_replay_tmpl_t<RecordType, ReaderType>::pick_next_input_for_mode(
         // a synthetic thread exit record.  We need to first throw out the
         // queued candidate record, if any.
         this->clear_input_queue(inputs_[index]);
-        inputs_[index].queue.push_back(this->create_thread_exit(inputs_[index].tid));
+        inputs_[index].queue.emplace_back(this->create_thread_exit(inputs_[index].tid),
+                                          /*is_real=*/false);
         VPRINT(this, 2, "early end for input %d\n", index);
         // We're done with this entry but we need the queued record to be read,
         // so we do not move past the entry.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 0b134488f8..ee2721d43a 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5770,9 +5770,11 @@ test_unscheduled()
 }
 
 static void
-test_kernel_switch_sequences()
+test_kernel_switch_sequences(bool use_input_ordinals)
 {
-    std::cerr << "\n----------------\nTesting kernel switch sequences\n";
+    std::cerr
+        << "\n----------------\nTesting kernel switch sequences for use_input_ordinals: "
+        << use_input_ordinals << "\n";
     static constexpr memref_tid_t TID_IN_SWITCHES = 1;
     static constexpr addr_t PROCESS_SWITCH_PC_START = 0xfeed101;
     static constexpr addr_t THREAD_SWITCH_PC_START = 0xcafe101;
@@ -5838,9 +5840,18 @@ test_kernel_switch_sequences()
         }
         sched_inputs.emplace_back(std::move(readers));
     }
+    dynamorio::drmemtrace::scheduler_tmpl_t<
+        dynamorio::drmemtrace::_memref_t,
+        dynamorio::drmemtrace::reader_t>::scheduler_flags_t flags =
+        scheduler_t::SCHEDULER_DEFAULTS;
+    if (use_input_ordinals) {
+        flags = static_cast<dynamorio::drmemtrace::scheduler_tmpl_t<
+            dynamorio::drmemtrace::_memref_t,
+            dynamorio::drmemtrace::reader_t>::scheduler_flags_t>(
+            flags | scheduler_t::SCHEDULER_USE_INPUT_ORDINALS);
+    }
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
-                                               scheduler_t::DEPENDENCY_TIMESTAMPS,
-                                               scheduler_t::SCHEDULER_DEFAULTS,
+                                               scheduler_t::DEPENDENCY_TIMESTAMPS, flags,
                                                /*verbosity=*/3);
     sched_ops.quantum_duration_instrs = INSTR_QUANTUM;
     sched_ops.kernel_switch_reader = std::move(switch_reader);
@@ -5865,6 +5876,7 @@ test_kernel_switch_sequences()
     std::vector<bool> in_switch(NUM_OUTPUTS, false);
     std::vector<uint64> prev_in_ord(NUM_OUTPUTS, 0);
     std::vector<uint64> prev_out_ord(NUM_OUTPUTS, 0);
+    std::vector<uint64> switch_seq_count(NUM_OUTPUTS, 0);
     while (num_eof < NUM_OUTPUTS) {
         for (int i = 0; i < NUM_OUTPUTS; i++) {
             if (eof[i])
@@ -5888,9 +5900,12 @@ test_kernel_switch_sequences()
                 sched_as_string[i] +=
                     'A' + static_cast<char>(memref.instr.tid - TID_BASE);
             }
+            bool now_switch = false;
             if (memref.marker.type == TRACE_TYPE_MARKER &&
-                memref.marker.marker_type == TRACE_MARKER_TYPE_CONTEXT_SWITCH_START)
+                memref.marker.marker_type == TRACE_MARKER_TYPE_CONTEXT_SWITCH_START) {
+                now_switch = true;
                 in_switch[i] = true;
+            }
             if (in_switch[i]) {
                 // Test that switch code is marked synthetic.
                 assert(outputs[i]->is_record_synthetic());
@@ -5899,10 +5914,14 @@ test_kernel_switch_sequences()
                 assert(outputs[i]->get_input_interface()->get_record_ordinal() ==
                            prev_in_ord[i] ||
                        // Won't match if we just switched inputs.
-                       (memref.marker.type == TRACE_TYPE_MARKER &&
-                        memref.marker.marker_type ==
-                            TRACE_MARKER_TYPE_CONTEXT_SWITCH_START));
-                assert(outputs[i]->get_record_ordinal() > prev_out_ord[i]);
+                       now_switch);
+                if (use_input_ordinals) {
+                    assert(outputs[i]->get_record_ordinal() == prev_out_ord[i] ||
+                           // Won't match if we just switched inputs.
+                           now_switch);
+                } else {
+                    assert(outputs[i]->get_record_ordinal() > prev_out_ord[i]);
+                }
             } else
                 assert(!outputs[i]->is_record_synthetic());
             if (type_is_instr(memref.instr.type))
@@ -5912,7 +5931,9 @@ test_kernel_switch_sequences()
                 case TRACE_MARKER_TYPE_VERSION: sched_as_string[i] += 'v'; break;
                 case TRACE_MARKER_TYPE_TIMESTAMP: sched_as_string[i] += '0'; break;
                 case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END:
+                    assert(in_switch[i]);
                     in_switch[i] = false;
+                    ++switch_seq_count[i];
                     ANNOTATE_FALLTHROUGH;
                 case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START:
                     if (memref.marker.marker_value == scheduler_t::SWITCH_PROCESS)
@@ -5930,6 +5951,12 @@ test_kernel_switch_sequences()
             prev_out_ord[i] = outputs[i]->get_record_ordinal();
         }
     }
+    for (int i = 0; i < NUM_OUTPUTS; i++) {
+        assert(switch_seq_count[i] > 0);
+        assert(switch_seq_count[i] ==
+               static_cast<uint64>(outputs[i]->get_schedule_statistic(
+                   memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS)));
+    }
     // Check the high-level strings.
     for (int i = 0; i < NUM_OUTPUTS; i++) {
         std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
@@ -6730,7 +6757,8 @@ test_main(int argc, const char *argv[])
     test_inactive();
     test_direct_switch();
     test_unscheduled();
-    test_kernel_switch_sequences();
+    test_kernel_switch_sequences(/*use_input_ordinals=*/true);
+    test_kernel_switch_sequences(/*use_input_ordinals=*/false);
     test_random_schedule();
     test_record_scheduler();
     test_rebalancing();