OPT: Multithreaded WM rendering

This commit is contained in:
boreddevnl
2026-04-02 21:36:00 +02:00
parent e60f232812
commit 91b67bd8d5
7 changed files with 567 additions and 461 deletions

View File

@@ -9,17 +9,16 @@
#include "platform.h"
#include "paging.h"
#include "process.h"
#include "work_queue.h"
extern void serial_write(const char *str);
extern void serial_write_num(uint32_t n);
extern void serial_write_hex(uint64_t n);
// --- Dynamically allocated per-CPU state ---
static cpu_state_t *cpu_states = NULL; // Array[cpu_count]
static cpu_state_t *cpu_states = NULL;
static uint32_t total_cpus = 0;
static uint32_t bsp_lapic_id = 0;
// Get LAPIC ID via CPUID leaf 0x01 (works on all x86_64)
static uint32_t read_lapic_id(void) {
uint32_t eax, ebx, ecx, edx;
asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1));
@@ -44,49 +43,37 @@ cpu_state_t *smp_get_cpu(uint32_t cpu_id) {
return &cpu_states[cpu_id];
}
// --- AP Entry Point ---
// Called by Limine on each Application Processor.
// The limine_smp_info* is passed as a parameter.
static void ap_entry(struct limine_smp_info *info) {
// 1. Figure out which CPU we are
uint32_t my_id = (uint32_t)(info->extra_argument);
// 2. Enable FPU/SSE on this core (same as BSP does in platform_init)
uint64_t cr0;
asm volatile("mov %%cr0, %0" : "=r"(cr0));
cr0 &= ~(1ULL << 2); // Clear EM
cr0 |= (1ULL << 1); // Set MP
cr0 |= (1ULL << 5); // Set NE
cr0 &= ~(1ULL << 2);
cr0 |= (1ULL << 1);
cr0 |= (1ULL << 5);
asm volatile("mov %0, %%cr0" : : "r"(cr0));
uint64_t cr4;
asm volatile("mov %%cr4, %0" : "=r"(cr4));
cr4 |= (1ULL << 9); // OSFXSR
cr4 |= (1ULL << 10); // OSXMMEXCPT
cr4 |= (1ULL << 9);
cr4 |= (1ULL << 10);
asm volatile("mov %0, %%cr4" : : "r"(cr4));
asm volatile("fninit");
// 3. Load the shared GDT and properly reload all segments (including CS=0x08)
extern struct gdt_ptr gdtr;
extern void gdt_flush(uint64_t);
gdt_flush((uint64_t)&gdtr);
// 4. Load per-CPU TSS
gdt_load_ap_tss(my_id);
// 5. Load the shared IDT
extern void idt_load(void);
idt_load();
// 6. Load the kernel page tables (same CR3 as BSP — shared kernel space)
uint64_t kernel_cr3 = paging_get_pml4_phys();
asm volatile("mov %0, %%cr3" : : "r"(kernel_cr3));
// 7. Enable LAPIC on this core so it can receive IPIs
extern void lapic_enable(void);
lapic_enable();
// 8. Mark ourselves as online
cpu_states[my_id].online = true;
serial_write("[SMP] AP ");
@@ -95,25 +82,17 @@ static void ap_entry(struct limine_smp_info *info) {
serial_write_num(cpu_states[my_id].lapic_id);
serial_write(")\n");
// 9. Initialize the current_process pointer for this CPU
// Create a dedicated idle task for this AP (PID 0 is reserved for the BSP)
process_t *ap_idle = process_create(NULL, false); // Idle process
process_t *ap_idle = process_create(NULL, false);
ap_idle->cpu_affinity = my_id;
process_set_current_for_cpu(my_id, ap_idle);
// 10. Enable interrupts and enter idle halt loop.
// APs will be woken by scheduling IPIs from BSP (vector 0x41).
// The IPI handler does context switching for this CPU's processes.
asm volatile("sti");
// Idle loop — APs halt and wait for IPI
for (;;) { asm volatile("hlt"); }
work_queue_drain_loop();
}
// --- SMP Initialization ---
uint32_t smp_init(struct limine_smp_response *smp_resp) {
if (!smp_resp || smp_resp->cpu_count <= 1) {
// Single CPU system — just set up the BSP entry
total_cpus = 1;
cpu_states = (cpu_state_t *)kmalloc(sizeof(cpu_state_t));
if (!cpu_states) return 1;
@@ -135,7 +114,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
serial_write_num(bsp_lapic_id);
serial_write("\n");
// Allocate per-CPU state array
cpu_states = (cpu_state_t *)kmalloc(total_cpus * sizeof(cpu_state_t));
if (!cpu_states) {
serial_write("[SMP] ERROR: Failed to allocate CPU state array!\n");
@@ -145,10 +123,8 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
extern void mem_memset(void *, int, size_t);
mem_memset(cpu_states, 0, total_cpus * sizeof(cpu_state_t));
// Initialize per-CPU GDT/TSS entries for all CPUs
gdt_init_ap_tss(total_cpus);
// Fill in CPU state and start APs
uint32_t bsp_index = 0;
for (uint32_t i = 0; i < total_cpus; i++) {
struct limine_smp_info *cpu = smp_resp->cpus[i];
@@ -156,7 +132,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
cpu_states[i].lapic_id = cpu->lapic_id;
if (cpu->lapic_id == bsp_lapic_id) {
// This is the BSP — already running
cpu_states[i].online = true;
bsp_index = i;
serial_write("[SMP] BSP CPU ");
@@ -165,7 +140,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
serial_write_num(cpu->lapic_id);
serial_write(") online\n");
} else {
// Allocate a kernel stack for this AP
void *ap_stack = kmalloc_aligned(65536, 65536);
if (!ap_stack) {
serial_write("[SMP] ERROR: Failed to allocate AP stack!\n");
@@ -175,27 +149,18 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
cpu_states[i].kernel_stack_alloc = ap_stack;
cpu_states[i].online = false;
// Set extra_argument so the AP knows its index
cpu->extra_argument = i;
// Tell Limine to start this AP. The atomic write to goto_address below is
// what releases the parked AP: it jumps to ap_entry and receives a pointer
// to its limine_smp_info as the argument.
// extra_argument (set above) only carries the AP's index; it does not
// supply a stack. Important: Limine creates a temporary stack for the AP.
serial_write("[SMP] Starting AP ");
serial_write_num(i);
serial_write(" (LAPIC ");
serial_write_num(cpu->lapic_id);
serial_write(")...\n");
// This atomic write triggers the AP to start executing at ap_entry
__atomic_store_n(&cpu->goto_address, ap_entry, __ATOMIC_SEQ_CST);
}
}
// Wait for all APs to come online (with timeout)
volatile uint32_t timeout = 10000000;
uint32_t online_count = 0;
while (timeout-- > 0) {

View File

@@ -9,6 +9,8 @@
#include "wm.h"
#include "fat32.h"
#include "paging.h"
#include "work_queue.h"
#include "smp.h"
#include "platform.h"
#include "io.h"
#include "pci.h"
@@ -630,26 +632,25 @@ static uint64_t syscall_handler_inner(registers_t *regs) {
if (win->pixels) {
int rx = (int)params[0]; int ry = (int)params[1];
int rw = (int)params[2]; int rh = (int)params[3];
int src_w = rw;
int src_x_offset = 0;
int src_y_offset = 0;
if (rx < 0) { src_x_offset = -rx; rw += rx; rx = 0; }
if (ry < 0) { src_y_offset = -ry; rh += ry; ry = 0; }
if (rx + rw > win->w) rw = win->w - rx;
if (ry + rh > (win->h - 20)) rh = (win->h - 20) - ry;
if (rw > 0 && rh > 0) {
for (int y = 0; y < rh; y++) {
uint32_t *dest = &win->pixels[(ry + y) * win->w + rx];
uint32_t *src = &image_data[(src_y_offset + y) * (int)params[2] + src_x_offset];
uint32_t *src = &image_data[(src_y_offset + y) * src_w + src_x_offset];
for (int x = 0; x < rw; x++) {
uint32_t s = src[x];
uint8_t alpha = (s >> 24) & 0xFF;
if (alpha == 0xFF) {
dest[x] = s;
} else if (alpha == 0) {
// Skip
} else {
} else if (alpha > 0) {
uint32_t d = dest[x];
uint32_t rb = ((s & 0xFF00FF) * alpha + (d & 0xFF00FF) * (255 - alpha)) >> 8;
uint32_t g = ((s & 0x00FF00) * alpha + (d & 0x00FF00) * (255 - alpha)) >> 8;