diff --git a/include/PICA/dynapica/shader_rec_emitter_x64.hpp b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
index d22ed371..0338911c 100644
--- a/include/PICA/dynapica/shader_rec_emitter_x64.hpp
+++ b/include/PICA/dynapica/shader_rec_emitter_x64.hpp
@@ -91,6 +91,7 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	void recCMP(const PICAShader& shader, u32 instruction);
 	void recDP3(const PICAShader& shader, u32 instruction);
 	void recDP4(const PICAShader& shader, u32 instruction);
+	void recDPH(const PICAShader& shader, u32 instruction);
 	void recEMIT(const PICAShader& shader, u32 instruction);
 	void recEND(const PICAShader& shader, u32 instruction);
 	void recEX2(const PICAShader& shader, u32 instruction);
@@ -111,7 +112,6 @@ class ShaderEmitter : public Xbyak::CodeGenerator {
 	void recRSQ(const PICAShader& shader, u32 instruction);
 	void recSETEMIT(const PICAShader& shader, u32 instruction);
 	void recSGE(const PICAShader& shader, u32 instruction);
-	void recSGEI(const PICAShader& shader, u32 instruction);
 	void recSLT(const PICAShader& shader, u32 instruction);
 
 	MAKE_LOG_FUNCTION(log, shaderJITLogger)
diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index 0f3154f1..48db777d 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -23,6 +23,7 @@ namespace ShaderOpcodes {
 		LG2 = 0x06,
 		LIT = 0x07,
 		MUL = 0x08,
+		SGE = 0x09,
 		SLT = 0x0A,
 		FLR = 0x0B,
 		MAX = 0x0C,
diff --git a/include/kernel/kernel.hpp b/include/kernel/kernel.hpp
index 2db7cdda..99687ee1 100644
--- a/include/kernel/kernel.hpp
+++ b/include/kernel/kernel.hpp
@@ -52,6 +52,9 @@ class Kernel {
 	// Top 8 bits are the major version, bottom 8 are the minor version
 	u16 kernelVersion = 0;
 
+	// Indicates whether a reschedule is needed
+	bool needReschedule = false;
+
 	Handle makeArbiter();
 	Handle makeProcess(u32 id);
 	Handle makePort(const char* name);
@@ -73,7 +76,6 @@ private:
 	void switchThread(int newThreadIndex);
 	void sortThreads();
 	std::optional<int> getNextThread();
-	void switchToNextThread();
 	void rescheduleThreads();
 	bool canThreadRun(const Thread& t);
 	bool shouldWaitOnObject(KernelObject* object);
@@ -168,6 +170,15 @@ public:
 	void serviceSVC(u32 svc);
 	void reset();
 
+	void requireReschedule() { needReschedule = true; }  // Flag that a reschedule is needed; evalReschedule() acts on the flag
+
+	void evalReschedule() {  // Performs a reschedule if one was requested via requireReschedule(), otherwise does nothing
+		if (needReschedule) {
+			needReschedule = false;  // Clear the request before rescheduling
+			rescheduleThreads();
+		}
+	}
+
 	Handle makeObject(KernelObjectType type) {
 		if (handleCounter > KernelHandles::Max) [[unlikely]] {
 			Helpers::panic("Hlep we somehow created enough kernel objects to overflow this thing");
diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
index 13eb630e..7bcf4b46 100644
--- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
+++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp
@@ -143,6 +143,7 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 			break;
 		case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break;
 		case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break;
+		case ShaderOpcodes::DPH: recDPH(shaderUnit, instruction); break;
 		case ShaderOpcodes::END: recEND(shaderUnit, instruction); break;
 		case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break;
 		case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break;
@@ -179,6 +180,10 @@ void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) {
 		case ShaderOpcodes::SLTI:
 			recSLT(shaderUnit, instruction); break;
 
+		case ShaderOpcodes::SGE:
+		case ShaderOpcodes::SGEI:
+			recSGE(shaderUnit, instruction); break;
+
 		default:
 			Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode);
 	}
@@ -525,6 +530,30 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
+void ShaderEmitter::recDPH(const PICAShader& shader, u32 instruction) {  // Emits DPH: 4-lane dot product of src1 (with w replaced) and src2
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+	const u32 src1 = getBits<12, 7>(instruction);
+	const u32 src2 = getBits<7, 5>(instruction);  // src2 coming first because PICA moment
+	const u32 idx = getBits<19, 2>(instruction);
+	const u32 dest = getBits<21, 5>(instruction);
+
+	// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
+	loadRegister<1>(src1_xmm, shader, src1, idx, operandDescriptor);
+	loadRegister<2>(src2_xmm, shader, src2, 0, operandDescriptor);
+
+	// Attach 1.0 to the w component of src1 (assumes onesVector holds 1.0f in every lane)
+	if (haveSSE4_1) {
+		blendps(src1_xmm, xword[rip + onesVector], 0b1000);  // Take only lane 3 (w) from onesVector
+	} else {
+		movaps(scratch1, src1_xmm);                   // scratch1 = {x, y, z, w}
+		unpckhps(scratch1, xword[rip + onesVector]);  // scratch1 = {z, ones[2], w, ones[3]}
+		unpcklpd(src1_xmm, scratch1);                 // src1 = {x, y, z, ones[2]}
+	}
+	// NOTE(review): dpps is itself an SSE4.1 instruction and is emitted unconditionally, so the fallback above is not enough for non-SSE4.1 hosts
+	dpps(src1_xmm, src2_xmm, 0b11111111);  // 4-lane dot product between the 2 registers, store the result in all lanes of src1_xmm similarly to PICA
+	storeRegister(src1_xmm, shader, dest, operandDescriptor);
+}
+
 void ShaderEmitter::recMAX(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
@@ -656,6 +685,24 @@ void ShaderEmitter::recSLT(const PICAShader& shader, u32 instruction) {
 	storeRegister(src1_xmm, shader, dest, operandDescriptor);
 }
 
+void ShaderEmitter::recSGE(const PICAShader& shader, u32 instruction) {  // Emits SGE/SGEI: per-lane src1 >= src2 comparison stored to dest
+	const bool isSGEI = (instruction >> 26) == ShaderOpcodes::SGEI;  // The opcode is the top 6 bits of the instruction word
+	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
+
+	const u32 src1 = isSGEI ? getBits<14, 5>(instruction) : getBits<12, 7>(instruction);  // SGEI uses different source bitfields than SGE
+	const u32 src2 = isSGEI ? getBits<7, 7>(instruction) : getBits<7, 5>(instruction);
+	const u32 idx = getBits<19, 2>(instruction);
+	const u32 dest = getBits<21, 5>(instruction);
+
+	loadRegister<1>(src1_xmm, shader, src1, isSGEI ? 0 : idx, operandDescriptor);  // idx is applied to src1 for SGE and to src2 for SGEI
+	loadRegister<2>(src2_xmm, shader, src2, isSGEI ? idx : 0, operandDescriptor);
+	
+	// SSE does not have a cmpgeps instruction so we turn src1 >= src2 to src2 <= src1, result in src2
+	cmpleps(src2_xmm, src1_xmm);  // Each lane becomes all-ones if src2 <= src1, all-zeros otherwise
+	andps(src2_xmm, xword[rip + onesVector]);  // Turn the lane masks into values: onesVector's lane where true, 0.0 where false
+	storeRegister(src2_xmm, shader, dest, operandDescriptor);
+}
+
 void ShaderEmitter::recCMP(const PICAShader& shader, u32 instruction) {
 	const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
 	const u32 src1 = getBits<12, 7>(instruction);
diff --git a/src/core/kernel/address_arbiter.cpp b/src/core/kernel/address_arbiter.cpp
index 9fb9f7be..8c07b423 100644
--- a/src/core/kernel/address_arbiter.cpp
+++ b/src/core/kernel/address_arbiter.cpp
@@ -87,7 +87,7 @@ void Kernel::arbitrateAddress() {
 			Helpers::panic("ArbitrateAddress: Unimplemented type %s", arbitrationTypeToString(type));
 	}
 
-	rescheduleThreads();
+	requireReschedule();
 }
 
 // Signal up to "threadCount" threads waiting on the arbiter indicated by "waitingAddress"
diff --git a/src/core/kernel/events.cpp b/src/core/kernel/events.cpp
index e117dd62..ca72add1 100644
--- a/src/core/kernel/events.cpp
+++ b/src/core/kernel/events.cpp
@@ -35,22 +35,15 @@ bool Kernel::signalEvent(Handle handle) {
 
 	// Check if there's any thread waiting on this event
 	if (event->waitlist != 0) {
-		// One-shot events get cleared once they are acquired by some thread and only wake up 1 thread at a time
+		wakeupAllThreads(event->waitlist, handle);
+		event->waitlist = 0;  // No threads waiting;
+
 		if (event->resetType == ResetType::OneShot) {
-			int index = wakeupOneThread(event->waitlist, handle); // Wake up one thread with the highest priority
-			event->waitlist ^= (1ull << index); // Remove thread from waitlist
 			event->fired = false;
-		} else {
-			wakeupAllThreads(event->waitlist, handle);
-			event->waitlist = 0; // No threads waiting;
 		}
-
-		// We must reschedule our threads if we signalled one. Some games such as FE: Awakening rely on this
-		// If this does not happen, we can have phenomena such as a thread waiting up a higher priority thread,
-		// and the higher priority thread just never running
-		rescheduleThreads();
 	}
-
+	
+	rescheduleThreads();
 	return true;
 }
 
@@ -121,7 +114,6 @@ void Kernel::waitSynchronization1() {
 	if (!shouldWaitOnObject(object)) {
 		acquireSyncObject(object, threads[currentThreadIndex]); // Acquire the object since it's ready
 		regs[0] = Result::Success;
-		rescheduleThreads();
 	} else {
 		// Timeout is 0, don't bother waiting, instantly timeout
 		if (ns == 0) {
@@ -141,7 +133,7 @@ void Kernel::waitSynchronization1() {
 		// Add the current thread to the object's wait list
 		object->getWaitlist() |= (1ull << currentThreadIndex);
 
-		switchToNextThread();
+		requireReschedule();
 	}
 }
 
@@ -204,14 +196,13 @@ void Kernel::waitSynchronizationN() {
 
 	auto& t = threads[currentThreadIndex];
 
-	// We only need to wait on one object. Easy...?!
+	// We only need to wait on one object. Easy.
 	if (!waitAll) {
 		// If there's ready objects, acquire the first one and return
 		if (oneObjectReady) {
 			regs[0] = Result::Success;
 			regs[1] = firstReadyObjectIndex; // Return index of the acquired object
 			acquireSyncObject(waitObjects[firstReadyObjectIndex].second, t); // Acquire object
-			rescheduleThreads();
 			return;
 		}
 
@@ -229,8 +220,8 @@ void Kernel::waitSynchronizationN() {
 			waitObjects[i].second->getWaitlist() |= (1ull << currentThreadIndex); // And add the thread to the object's waitlist
 		}
 
-		switchToNextThread();
+		requireReschedule();
 	} else {
-		Helpers::panic("WaitSynchronizatioN with waitAll");
+		Helpers::panic("WaitSynchronizationN with waitAll");
 	}
 }
\ No newline at end of file
diff --git a/src/core/kernel/kernel.cpp b/src/core/kernel/kernel.cpp
index 8f3aeda0..c48c8f18 100644
--- a/src/core/kernel/kernel.cpp
+++ b/src/core/kernel/kernel.cpp
@@ -61,6 +61,8 @@ void Kernel::serviceSVC(u32 svc) {
 		case 0x3D: outputDebugString(); break;
 		default: Helpers::panic("Unimplemented svc: %X @ %08X", svc, regs[15]); break;
 	}
+
+	evalReschedule();
 }
 
 void Kernel::setVersion(u8 major, u8 minor) {
@@ -140,6 +142,8 @@ void Kernel::reset() {
 	threadIndices.clear();
 	serviceManager.reset();
 
+	needReschedule = false;
+
 	// Allocate handle #0 to a dummy object and make a main process object
 	makeObject(KernelObjectType::Dummy);
 	currentProcess = makeProcess(1); // Use ID = 1 for main process
diff --git a/src/core/kernel/ports.cpp b/src/core/kernel/ports.cpp
index a7351fd0..84c8cc05 100644
--- a/src/core/kernel/ports.cpp
+++ b/src/core/kernel/ports.cpp
@@ -76,6 +76,11 @@ void Kernel::sendSyncRequest() {
 	u32 messagePointer = getTLSPointer() + 0x80; // The message is stored starting at TLS+0x80
 	logSVC("SendSyncRequest(session handle = %X)\n", handle);
 
+	// Service calls via SendSyncRequest and file accesses need to put the caller to sleep for a given amount of time
+	// to make sure that the other threads don't get starved. Various games rely on this (including Sonic Boom: Shattering Crystal, it seems)
+	constexpr u64 syncRequestDelayNs = 39000;
+	sleepThread(syncRequestDelayNs);
+
 	// The sync request is being sent at a service rather than whatever port, so have the service manager intercept it
 	if (KernelHandles::isServiceHandle(handle)) {
 		// The service call might cause a reschedule and change threads. Hence, set r0 before executing the service call
diff --git a/src/core/kernel/threads.cpp b/src/core/kernel/threads.cpp
index 587d5fc4..2e39b620 100644
--- a/src/core/kernel/threads.cpp
+++ b/src/core/kernel/threads.cpp
@@ -82,32 +82,26 @@ std::optional<int> Kernel::getNextThread() {
 	return std::nullopt;
 }
 
-void Kernel::switchToNextThread() {
-	std::optional<int> newThreadIndex = getNextThread();
-
-	if (!newThreadIndex.has_value()) {
-		log("Kernel tried to switch to the next thread but none found. Switching to random thread\n");
-		assert(aliveThreadCount != 0);
-		Helpers::panic("rpog");
-
-		int index;
-		do {
-			index = rand() % threadCount;
-		} while (threads[index].status == ThreadStatus::Dead); // TODO: Pray this doesn't hang
-
-		switchThread(index);
-	} else {
-		switchThread(newThreadIndex.value());
-	}
-}
-
-// See if there;s a higher priority, ready thread and switch to that
+// Switch to the thread picked by getNextThread, or to the idle thread if it returns none
 void Kernel::rescheduleThreads() {
+	Thread& current = threads[currentThreadIndex];  // Current running thread
+
+	// If the current thread is running and hasn't gone to sleep or whatever, set it to Ready instead of Running
+	// So that getNextThread will evaluate it properly
+	if (current.status == ThreadStatus::Running) {
+		current.status = ThreadStatus::Ready;
+	}
+	ThreadStatus currentStatus = current.status;  // NOTE(review): currentStatus is never read in this function; candidate for removal
 	std::optional<int> newThreadIndex = getNextThread();
 
-	if (newThreadIndex.has_value() && newThreadIndex.value() != currentThreadIndex) {
-		threads[currentThreadIndex].status = ThreadStatus::Ready;
+	// Case 1: A thread can run (this may be the current thread itself)
+	if (newThreadIndex.has_value()) {
 		switchThread(newThreadIndex.value());
+	} 
+	
+	// Case 2: No thread can run, go straight to the idle thread
+	else {
+		switchThread(idleThreadIndex);
 	}
 }
 
@@ -184,6 +178,7 @@ void Kernel::releaseMutex(Mutex* moo) {
 	// If the lock count reached 0 then the thread no longer owns the mootex and it can be given to a new one
 	if (moo->lockCount == 0) {
 		moo->locked = false;
+
 		if (moo->waitlist != 0) {
 			int index = wakeupOneThread(moo->waitlist, moo->handle); // Wake up one thread and get its index
 			moo->waitlist ^= (1ull << index); // Remove thread from waitlist
@@ -194,7 +189,7 @@ void Kernel::releaseMutex(Mutex* moo) {
 			moo->ownerThread = index;
 		}
 
-		rescheduleThreads();
+		requireReschedule();
 	}
 }
 
@@ -210,7 +205,7 @@ void Kernel::sleepThreadOnArbiter(u32 waitingAddress) {
 	t.status = ThreadStatus::WaitArbiter;
 	t.waitingAddress = waitingAddress;
 
-	switchToNextThread();
+	requireReschedule();
 }
 
 // Acquires an object that is **ready to be acquired** without waiting on it
@@ -226,7 +221,13 @@ void Kernel::acquireSyncObject(KernelObject* object, const Thread& thread) {
 
 		case KernelObjectType::Mutex: {
 			Mutex* moo = object->getData<Mutex>();
-			moo->locked = true; // Set locked to true, whether it's false or not because who cares
+
+			// Only reschedule if we're acquiring the mutex for the first time
+			if (!moo->locked) {
+				moo->locked = true;
+				requireReschedule();
+			}
+
 			// Increment lock count by 1. If a thread acquires a mootex multiple times, it needs to release it until count == 0
 			// For the mootex to be free.
 			moo->lockCount++;
@@ -338,20 +339,31 @@ void Kernel::wakeupAllThreads(u64 waitlist, Handle handle) {
 void Kernel::sleepThread(s64 ns) {
 	if (ns < 0) {
 		Helpers::panic("Sleeping a thread for a negative amount of ns");
-	} else if (ns == 0) { // Used when we want to force a thread switch
-		std::optional<int> newThreadIndex = getNextThread();
-		// If there's no other thread waiting, don't bother yielding
-		if (newThreadIndex.has_value()) {
-			threads[currentThreadIndex].status = ThreadStatus::Ready;
-			switchThread(newThreadIndex.value());
-		}
-	} else { // If we're sleeping for > 0 ns
+	} else if (ns == 0) {
+		// TODO: This is garbage, but it works so eh we can keep it for now
 		Thread& t = threads[currentThreadIndex];
+
+		// See if a thread other than this and the idle thread is waiting to run by temporarily marking the current thread as dead and searching
+		// If there is another thread to run, then run it. Otherwise, go back to this thread, not to the idle thread
+		t.status = ThreadStatus::Dead;
+		auto nextThreadIndex = getNextThread();
+		t.status = ThreadStatus::Ready;  // NOTE(review): if no switch happens below, this thread stays marked Ready; confirm that is intended
+
+		if (nextThreadIndex.has_value()) {
+			const auto index = nextThreadIndex.value();
+
+			if (index != idleThreadIndex) {
+				switchThread(index);
+			}
+		}
+	} else {  // If we're sleeping for > 0 ns
+		Thread& t = threads[currentThreadIndex];
+
 		t.status = ThreadStatus::WaitSleep;
 		t.waitingNanoseconds = ns;
 		t.sleepTick = cpu.getTicks();
 
-		switchToNextThread();
+		requireReschedule();
 	}
 }
 
@@ -374,7 +386,7 @@ void Kernel::createThread() {
 
 	regs[0] = Result::Success;
 	regs[1] = makeThread(entrypoint, initialSP, priority, id, arg, ThreadStatus::Ready);
-	rescheduleThreads();
+	requireReschedule();
 }
 
 // void SleepThread(s64 nanoseconds)
@@ -448,7 +460,7 @@ void Kernel::setThreadPriority() {
 		}
 	}
 	sortThreads();
-	rescheduleThreads();
+	requireReschedule();
 }
 
 void Kernel::exitThread() {
@@ -472,7 +484,7 @@ void Kernel::exitThread() {
 		t.threadsWaitingForTermination = 0; // No other threads waiting
 	}
 
-	switchToNextThread();
+	requireReschedule();
 }
 
 void Kernel::svcCreateMutex() {
diff --git a/src/emulator.cpp b/src/emulator.cpp
index fd5efe6b..2e7cd521 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -357,6 +357,8 @@ void Emulator::run() {
 
 			hid.updateInputs(cpu.getTicks());
 		}
+		// TODO: Should this be uncommented?
+		// kernel.evalReschedule();
 
 		// Update inputs in the HID module
 		SDL_GL_SwapWindow(window);
diff --git a/src/host_shaders/opengl_fragment_shader.frag b/src/host_shaders/opengl_fragment_shader.frag
index f6461094..5b6e6830 100644
--- a/src/host_shaders/opengl_fragment_shader.frag
+++ b/src/host_shaders/opengl_fragment_shader.frag
@@ -228,10 +228,18 @@ void calcLighting(out vec4 primary_color, out vec4 secondary_color) {
 			decodeFP(bitfieldExtract(GPUREG_LIGHTi_VECTOR_HIGH, 0, 16), 5, 10)
 		));
 
-		// Positional Light
-		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) error_unimpl = true;
+		vec3 half_vector;
 
-		vec3 half_vector = normalize(normalize(light_vector) + view);
+		// Positional Light
+		if (bitfieldExtract(GPUREG_LIGHTi_CONFIG, 0, 1) == 0) {
+			error_unimpl = true;
+			// half_vector = normalize(normalize(light_vector + v_view) + view);
+		}
+
+		// Directional light
+		else {
+			half_vector = normalize(normalize(light_vector) + view);
+		}
 
 		for (int c = 0; c < 7; c++) {
 			if (bitfieldExtract(GPUREG_LIGHTING_CONFIG1, 16 + c, 1) == 0) {