OPT: Multithreaded WM rendering

2026-07-30 03:53:24 +00:00 · 2026-04-02 21:36:00 +02:00
parent e60f232812
commit 91b67bd8d5
7 changed files with 567 additions and 461 deletions
--- a/src/sys/smp.c
+++ b/src/sys/smp.c
@@ -9,17 +9,16 @@
 #include "platform.h"
 #include "paging.h"
 #include "process.h"
+#include "work_queue.h"

 extern void serial_write(const char *str);
 extern void serial_write_num(uint32_t n);
 extern void serial_write_hex(uint64_t n);

-// --- Dynamically allocated per-CPU state ---
-static cpu_state_t *cpu_states = NULL;  // Array[cpu_count]
+static cpu_state_t *cpu_states = NULL;
 static uint32_t total_cpus = 0;
 static uint32_t bsp_lapic_id = 0;

-// Get LAPIC ID via CPUID leaf 0x01 (works on all x86_64)
 static uint32_t read_lapic_id(void) {
    uint32_t eax, ebx, ecx, edx;
    asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1));
@@ -44,49 +43,37 @@ cpu_state_t *smp_get_cpu(uint32_t cpu_id) {
    return &cpu_states[cpu_id];
 }

-// --- AP Entry Point ---
-// Called by Limine on each Application Processor.
-// The limine_smp_info* is passed as a parameter.
 static void ap_entry(struct limine_smp_info *info) {
-    // 1. Figure out which CPU we are
    uint32_t my_id = (uint32_t)(info->extra_argument);
-
-    // 2. Enable FPU/SSE on this core (same as BSP does in platform_init)
    uint64_t cr0;
    asm volatile("mov %%cr0, %0" : "=r"(cr0));
-    cr0 &= ~(1ULL << 2); // Clear EM
-    cr0 |= (1ULL << 1);  // Set MP
-    cr0 |= (1ULL << 5);  // Set NE
+    cr0 &= ~(1ULL << 2);
+    cr0 |= (1ULL << 1);
+    cr0 |= (1ULL << 5);
    asm volatile("mov %0, %%cr0" : : "r"(cr0));

    uint64_t cr4;
    asm volatile("mov %%cr4, %0" : "=r"(cr4));
-    cr4 |= (1ULL << 9);  // OSFXSR
-    cr4 |= (1ULL << 10); // OSXMMEXCPT
+    cr4 |= (1ULL << 9);
+    cr4 |= (1ULL << 10);
    asm volatile("mov %0, %%cr4" : : "r"(cr4));
    asm volatile("fninit");

-    // 3. Load the shared GDT and properly reload all segments (including CS=0x08)
    extern struct gdt_ptr gdtr;
    extern void gdt_flush(uint64_t);
    gdt_flush((uint64_t)&gdtr);

-    // 4. Load per-CPU TSS
    gdt_load_ap_tss(my_id);

-    // 5. Load the shared IDT
    extern void idt_load(void);
    idt_load();

-    // 6. Load the kernel page tables (same CR3 as BSP — shared kernel space)
    uint64_t kernel_cr3 = paging_get_pml4_phys();
    asm volatile("mov %0, %%cr3" : : "r"(kernel_cr3));

-    // 7. Enable LAPIC on this core so it can receive IPIs
    extern void lapic_enable(void);
    lapic_enable();

-    // 8. Mark ourselves as online
    cpu_states[my_id].online = true;

    serial_write("[SMP] AP ");
@@ -95,25 +82,17 @@ static void ap_entry(struct limine_smp_info *info) {
    serial_write_num(cpu_states[my_id].lapic_id);
    serial_write(")\n");

-    // 9. Initialize the current_process pointer for this CPU
-    // Create a dedicated idle task for this AP (PID 0 is reserved for the BSP)
-    process_t *ap_idle = process_create(NULL, false); // Idle process
+    process_t *ap_idle = process_create(NULL, false); 
    ap_idle->cpu_affinity = my_id;
    process_set_current_for_cpu(my_id, ap_idle);
-
-    // 10. Enable interrupts and enter idle halt loop.
-    // APs will be woken by scheduling IPIs from BSP (vector 0x41).
-    // The IPI handler does context switching for this CPU's processes.
    asm volatile("sti");

-    // Idle loop — APs halt and wait for IPI
-    for (;;) { asm volatile("hlt"); }
+    work_queue_drain_loop();
 }

 // --- SMP Initialization ---
 uint32_t smp_init(struct limine_smp_response *smp_resp) {
    if (!smp_resp || smp_resp->cpu_count <= 1) {
-        // Single CPU system — just set up the BSP entry
        total_cpus = 1;
        cpu_states = (cpu_state_t *)kmalloc(sizeof(cpu_state_t));
        if (!cpu_states) return 1;
@@ -135,7 +114,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
    serial_write_num(bsp_lapic_id);
    serial_write("\n");

-    // Allocate per-CPU state array
    cpu_states = (cpu_state_t *)kmalloc(total_cpus * sizeof(cpu_state_t));
    if (!cpu_states) {
        serial_write("[SMP] ERROR: Failed to allocate CPU state array!\n");
@@ -145,10 +123,8 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
    extern void mem_memset(void *, int, size_t);
    mem_memset(cpu_states, 0, total_cpus * sizeof(cpu_state_t));

-    // Initialize per-CPU GDT/TSS entries for all CPUs
    gdt_init_ap_tss(total_cpus);

-    // Fill in CPU state and start APs
    uint32_t bsp_index = 0;
    for (uint32_t i = 0; i < total_cpus; i++) {
        struct limine_smp_info *cpu = smp_resp->cpus[i];
@@ -156,7 +132,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
        cpu_states[i].lapic_id = cpu->lapic_id;

        if (cpu->lapic_id == bsp_lapic_id) {
-            // This is the BSP — already running
            cpu_states[i].online = true;
            bsp_index = i;
            serial_write("[SMP] BSP CPU ");
@@ -165,7 +140,6 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
            serial_write_num(cpu->lapic_id);
            serial_write(") online\n");
        } else {
-            // Allocate a kernel stack for this AP
            void *ap_stack = kmalloc_aligned(65536, 65536);
            if (!ap_stack) {
                serial_write("[SMP] ERROR: Failed to allocate AP stack!\n");
@@ -175,27 +149,18 @@ uint32_t smp_init(struct limine_smp_response *smp_resp) {
            cpu_states[i].kernel_stack_alloc = ap_stack;
            cpu_states[i].online = false;

-            // Set extra_argument so the AP knows its index
            cpu->extra_argument = i;

-            // Tell Limine to start this AP. Limine sets up the AP's stack
-            // from extra_argument's stack, but we need the goto_address.
-            // Limine will jump to ap_entry with the AP's limine_smp_info*.
-            // Important: Limine creates a temporary stack for the AP, and the
-            // goto_address is where the AP starts executing.
-
            serial_write("[SMP] Starting AP ");
            serial_write_num(i);
            serial_write(" (LAPIC ");
            serial_write_num(cpu->lapic_id);
            serial_write(")...\n");

-            // This atomic write triggers the AP to start executing at ap_entry
            __atomic_store_n(&cpu->goto_address, ap_entry, __ATOMIC_SEQ_CST);
        }
    }

-    // Wait for all APs to come online (with timeout)
    volatile uint32_t timeout = 10000000;
    uint32_t online_count = 0;
    while (timeout-- > 0) {
--- a/src/sys/syscall.c
+++ b/src/sys/syscall.c
@@ -9,6 +9,8 @@
 #include "wm.h"
 #include "fat32.h"
 #include "paging.h"
+#include "work_queue.h"
+#include "smp.h"
 #include "platform.h"
 #include "io.h"
 #include "pci.h"
@@ -630,26 +632,25 @@ static uint64_t syscall_handler_inner(registers_t *regs) {
                if (win->pixels) {
                    int rx = (int)params[0]; int ry = (int)params[1];
                    int rw = (int)params[2]; int rh = (int)params[3];
-                    
+                    int src_w = rw;
                    int src_x_offset = 0;
                    int src_y_offset = 0;
+
                    if (rx < 0) { src_x_offset = -rx; rw += rx; rx = 0; }
                    if (ry < 0) { src_y_offset = -ry; rh += ry; ry = 0; }
                    if (rx + rw > win->w) rw = win->w - rx;
                    if (ry + rh > (win->h - 20)) rh = (win->h - 20) - ry;
-                    
+
                    if (rw > 0 && rh > 0) {
                        for (int y = 0; y < rh; y++) {
                            uint32_t *dest = &win->pixels[(ry + y) * win->w + rx];
-                            uint32_t *src = &image_data[(src_y_offset + y) * (int)params[2] + src_x_offset];
+                            uint32_t *src = &image_data[(src_y_offset + y) * src_w + src_x_offset];
                            for (int x = 0; x < rw; x++) {
                                uint32_t s = src[x];
                                uint8_t alpha = (s >> 24) & 0xFF;
                                if (alpha == 0xFF) {
                                    dest[x] = s;
-                                } else if (alpha == 0) {
-                                    // Skip
-                                } else {
+                                } else if (alpha > 0) {
                                    uint32_t d = dest[x];
                                    uint32_t rb = ((s & 0xFF00FF) * alpha + (d & 0xFF00FF) * (255 - alpha)) >> 8;
                                    uint32_t g = ((s & 0x00FF00) * alpha + (d & 0x00FF00) * (255 - alpha)) >> 8;