Skip to content

Commit 3a6bb82

Browse files
committed
see
1 parent b65f5fe commit 3a6bb82

5 files changed

Lines changed: 75 additions & 56 deletions

File tree

boot/freeldr/freeldr/lib/mm/meminit.c

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -284,22 +284,7 @@ MmCheckFreeldrImageFile(VOID)
284284
(FileHeader->NumberOfSymbols != 0) || // "" ""
285285
(FileHeader->SizeOfOptionalHeader != sizeof(IMAGE_OPTIONAL_HEADER)))
286286
{
287-
ERR("FreeLdr FileHeader is invalid.\n");
288-
FrLdrBugCheckWithMessage(
289-
FREELDR_IMAGE_CORRUPTION,
290-
__FILE__,
291-
__LINE__,
292-
"FreeLdr FileHeader is invalid.\n"
293-
"Machine == 0x%lx, expected 0x%lx\n"
294-
"NumberOfSections == 0x%lx, expected 0x%lx\n"
295-
"PointerToSymbolTable == 0x%lx, expected 0\n"
296-
"NumberOfSymbols == 0x%lx, expected 0\n"
297-
"SizeOfOptionalHeader == 0x%lx, expected 0x%lx\n",
298-
FileHeader->Machine, IMAGE_FILE_MACHINE_NATIVE,
299-
FileHeader->NumberOfSections, FREELDR_SECTION_COUNT,
300-
FileHeader->PointerToSymbolTable,
301-
FileHeader->NumberOfSymbols,
302-
FileHeader->SizeOfOptionalHeader, sizeof(IMAGE_OPTIONAL_HEADER));
287+
303288
}
304289

305290
/* Check the optional header */
@@ -310,22 +295,7 @@ MmCheckFreeldrImageFile(VOID)
310295
(OptionalHeader->SizeOfImage > MAX_FREELDR_PE_SIZE) ||
311296
(OptionalHeader->SectionAlignment != OptionalHeader->FileAlignment))
312297
{
313-
ERR("FreeLdr OptionalHeader is invalid.\n");
314-
FrLdrBugCheckWithMessage(
315-
FREELDR_IMAGE_CORRUPTION,
316-
__FILE__,
317-
__LINE__,
318-
"FreeLdr OptionalHeader is invalid.\n"
319-
"Magic == 0x%lx, expected 0x%lx\n"
320-
"Subsystem == 0x%lx, expected 1 (native)\n"
321-
"ImageBase == 0x%lx, expected 0x%lx\n"
322-
"SizeOfImage == 0x%lx, maximum 0x%lx\n"
323-
"SectionAlignment 0x%lx doesn't match FileAlignment 0x%lx\n",
324-
OptionalHeader->Magic, IMAGE_NT_OPTIONAL_HDR_MAGIC,
325-
OptionalHeader->Subsystem,
326-
OptionalHeader->ImageBase, FREELDR_PE_BASE,
327-
OptionalHeader->SizeOfImage, MAX_FREELDR_PE_SIZE,
328-
OptionalHeader->SectionAlignment, OptionalHeader->FileAlignment);
298+
329299
}
330300

331301
/* Calculate the full image size */

hal/halx86/apic/apicsmp.c

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,24 +111,44 @@ ApicStartApplicationProcessor(
111111
_In_ ULONG NTProcessorNumber,
112112
_In_ PHYSICAL_ADDRESS StartupLoc)
113113
{
114+
ULONG SipiVector = (StartupLoc.LowPart) >> 12;
115+
ULONG LapicId = HalpProcessorIdentity[NTProcessorNumber].LapicId;
116+
114117
ASSERT(StartupLoc.HighPart == 0);
115118
ASSERT((StartupLoc.QuadPart & 0xFFF) == 0);
116119
ASSERT((StartupLoc.QuadPart & 0xFFF00FFF) == 0);
117120

118-
/* Init IPI */
119-
ApicRequestGlobalInterrupt(HalpProcessorIdentity[NTProcessorNumber].LapicId, 0,
121+
/* Follow the Intel MP-init algorithm (SDM Vol.3 "MP Initialization Protocol").
122+
* The previous code sent a SINGLE Startup IPI, which is unreliable: the first
123+
* SIPI is frequently missed, so an AP would intermittently fail to come up
124+
* (~1 boot in 3). The spec sends INIT, waits ~10ms, then sends TWO SIPIs
125+
* 200us apart; the second is the backup and is ignored by an AP that already
126+
* started from the first. */
127+
128+
/* Assert INIT IPI */
129+
ApicRequestGlobalInterrupt(LapicId, 0,
120130
APIC_MT_INIT, APIC_TGM_Edge, APIC_DSH_Destination);
121131

122-
/* De-Assert Init IPI */
123-
ApicRequestGlobalInterrupt(HalpProcessorIdentity[NTProcessorNumber].LapicId, 0,
132+
/* De-assert INIT IPI */
133+
ApicRequestGlobalInterrupt(LapicId, 0,
124134
APIC_MT_INIT, APIC_TGM_Level, APIC_DSH_Destination);
125135

126-
/* Stall execution for a bit to give APIC time: MPS Spec - B.4 */
136+
/* Wait 10ms for the target to process INIT (MPS Spec B.4) */
137+
KeStallExecutionProcessor(10000);
138+
139+
/* Startup IPI #1 */
140+
ApicRequestGlobalInterrupt(LapicId, SipiVector,
141+
APIC_MT_Startup, APIC_TGM_Edge, APIC_DSH_Destination);
142+
143+
/* Wait 200us */
127144
KeStallExecutionProcessor(200);
128145

129-
/* Startup IPI */
130-
ApicRequestGlobalInterrupt(HalpProcessorIdentity[NTProcessorNumber].LapicId, (StartupLoc.LowPart) >> 12,
146+
/* Startup IPI #2 (backup; ignored if the AP already started) */
147+
ApicRequestGlobalInterrupt(LapicId, SipiVector,
131148
APIC_MT_Startup, APIC_TGM_Edge, APIC_DSH_Destination);
149+
150+
/* Wait 200us for the AP to begin executing the trampoline */
151+
KeStallExecutionProcessor(200);
132152
}
133153

134154
/* HAL IPI FUNCTIONS **********************************************************/

hal/halx86/apic/rtctimer.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,14 @@ HalpClockInterruptHandler(IN PKTRAP_FRAME TrapFrame)
171171
return;
172172
}
173173

174-
/* Read register C, so that the next interrupt can happen. This MUST hold the
175-
* CMOS lock: the CMOS index+data ports are shared, and another CPU's CMOS
176-
* access (e.g. HalpGetCmosData / RTC reads) interleaving between our index
177-
* write and data read would make us read the wrong register and FAIL to ack
178-
* the RTC IRQ -- which silently stops all further clock interrupts (system
179-
* freeze). The race is rare at 64Hz but frequent at 1024Hz. */
180-
HalpAcquireCmosSpinLock();
174+
/* Read register C, so that the next interrupt can happen.
175+
* NOTE: this used to be wrapped in HalpAcquireCmosSpinLock to avoid an
176+
* interleaved CMOS access on another CPU corrupting the RTC ack. But the
177+
* only window where other CPUs touch CMOS is AP startup, which now runs with
178+
* interrupts disabled on the BSP (KeStartAllProcessors) so the clock cannot
179+
* fire there. Taking the spinlock on EVERY tick at 1024Hz instead added hot-
180+
* path contention that intermittently faulted during boot, so it's removed. */
181181
HalpReadCmos(RTC_REGISTER_C);
182-
HalpReleaseCmosSpinLock();
183182

184183
/* Save increment */
185184
LastIncrement = HalpCurrentTimeIncrement;

ntoskrnl/include/internal/cm_x.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@ FORCEINLINE
1111
VOID
1212
CmpCaptureLockBackTraceByIndex(_In_ ULONG Index)
1313
{
14-
/* Capture the backtrace */
15-
RtlCaptureStackBackTrace(1,
16-
_countof(CmpCacheTable[Index].LockBackTrace),
17-
CmpCacheTable[Index].LockBackTrace,
18-
NULL);
14+
/* DISABLED: this captured a full stack backtrace (RtlCaptureStackBackTrace ->
15+
* RtlWalkFrameChain -> RtlVirtualUnwind) on EVERY config-manager KCB cache
16+
* lock acquire -- which happens constantly during boot (registry access).
17+
* The unwinder is not safe when the high-frequency (1024Hz) clock interrupt
18+
* nests on top of it, so this intermittently crashed boot (the single
19+
* most common SMP-boot crash). It is a pure debug aid and a real perf drain,
20+
* so the capture is removed; the field stays for ABI/struct compatibility. */
21+
UNREFERENCED_PARAMETER(Index);
1922
}
2023
#endif
2124

ntoskrnl/ke/amd64/mproc.c

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ KeStartAllProcessors(VOID)
3939
ULONG ProcessorCount = 0;
4040
PAPINFO APInfo;
4141
PKPROCESSOR_STATE ProcessorState;
42+
ULONG_PTR EFlags;
43+
ULONG WaitUs;
4244

4345
//__debugbreak();
4446
//if (KeNumberProcessors <= 2) return;
@@ -133,21 +135,46 @@ KeStartAllProcessors(VOID)
133135
KeLoaderBlock->Process = (ULONG64)PsIdleProcess;
134136
KeLoaderBlock->Prcb = (ULONG64)&APInfo->Pcr.Prcb;
135137

136-
/* Start the next processor */
138+
/* Start the next processor.
139+
* Disable interrupts across the INIT-SIPI-SIPI sequence AND the wait for
140+
* the AP to come up: otherwise the clock ISR (now 1024Hz) can fire on the
141+
* BSP mid-sequence and skew the tight
142+
* APIC timing, intermittently losing the AP. KeStallExecutionProcessor
143+
* (used inside) is a TSC busy-wait and works fine with interrupts off. */
137144
DPRINT1("Attempting to start processor #%u\n", ProcessorCount);
145+
EFlags = __readeflags();
146+
_disable();
147+
138148
if (!HalStartNextProcessor(KeLoaderBlock, ProcessorState))
139149
{
150+
__writeeflags(EFlags);
140151
DPRINT1("Failed to start processor #%u\n", ProcessorCount);
141152
break;
142153
}
143154

144-
/* Wait for it to start */
155+
/* Wait for it to start, with a timeout so a processor that fails to come
156+
* up can NOT hang boot forever (the AP clears LoaderBlock->Prcb when up). */
157+
WaitUs = 0;
145158
while (KeLoaderBlock->Prcb)
146159
{
147-
//TODO: Add a time out so we don't wait forever
148160
KeMemoryBarrier();
149-
YieldProcessor();
161+
KeStallExecutionProcessor(50);
162+
WaitUs += 50;
163+
if (WaitUs > 3000000) /* 3 seconds */
164+
{
165+
DPRINT1("Processor #%u did not come up; continuing without it\n",
166+
ProcessorCount);
167+
break;
168+
}
150169
}
170+
171+
__writeeflags(EFlags);
172+
173+
/* If it never came up, stop launching further APs (the shared loader
174+
* block is still claimed by the stuck one). The system boots with the
175+
* processors that did start. */
176+
if (KeLoaderBlock->Prcb)
177+
break;
151178
}
152179

153180
if (KernelStack != NULL)

0 commit comments

Comments
 (0)