shader_jit: Fix/optimize conditional evaluation (#234)

* shader_jit: Add conditional unit-tests

Tests all permutations of X, Y, AND, OR with each possible input value.

* video_core: Fix shader-interpreter conditional-code initialization

Rather than preserving the incoming state of the conditional codes, the
shader-interpreter was setting them both to false. In pretty much all
cases, the initial state of a shader unit can be zero-initialized
statically. Just running the interpreter shouldn't necessarily reset the
conditional codes though. The JIT loads incoming conditional codes
while the shader-interpreter resets them to false. This makes the
interpreter match the behavior of the shader-jit.

* shader_jit_a64: Fix/optimize conditional evaluation

Fix some of the regressions introduced by the previous optimization.
EOR does not support a constant of `0` in its immediate. In these cases
the COND{0,1} registers can be used directly, with no EOR needed.

* shader_jit_x64: Fix conditional evaluation extended-bit hazard

The unit test seems to have identified a bug in the x64 jit too. The x64
jit was doing 32-bit comparisons despite the condition flags being 8-bit
values and is sensitive to garbage being in the upper 24 bits of the
register. This is fixed by using the proper 8-bit register types rather
than the 32-bit ones (`eax`, `ebx` -> `al`, `bl`).

* shader_jit_x64: Zero-extend conditional-code bytes

`mov` was doing a partial update of bits within the register, allowing
garbage to be introduced in the upper bits of the register.
This commit is contained in:
Wunk
2024-08-20 01:19:04 -07:00
committed by PabloMK7
parent cfc74d2d08
commit 0ea9f23f0c
6 changed files with 144 additions and 40 deletions

View File

@@ -52,9 +52,6 @@ static void RunInterpreter(const ShaderSetup& setup, ShaderUnit& state,
boost::circular_buffer<LoopStackElement> loop_stack(4);
u32 program_counter = entry_point;
state.conditional_code[0] = false;
state.conditional_code[1] = false;
const auto do_if = [&](Instruction instr, bool condition) {
if (condition) {
if_stack.push_back({

View File

@@ -386,28 +386,50 @@ void JitShader::Compile_SanitizedMul(QReg src1, QReg src2, QReg scratch0) {
}
void JitShader::Compile_EvaluateCondition(Instruction instr) {
const u8 refx = instr.flow_control.refx.Value();
const u8 refy = instr.flow_control.refy.Value();
const bool refx = instr.flow_control.refx.Value();
const bool refy = instr.flow_control.refy.Value();
switch (instr.flow_control.op) {
// Note: NXOR is used below to check for equality
case Instruction::FlowControlType::Or:
EOR(XSCRATCH0, COND0, refx ^ 1);
EOR(XSCRATCH1, COND1, refy ^ 1);
ORR(XSCRATCH0, XSCRATCH0, XSCRATCH1);
case Instruction::FlowControlType::Or: {
XReg OpX = XSCRATCH0;
if (!refx) {
EOR(OpX, COND0, u8(refx) ^ 1);
} else {
OpX = COND0;
}
XReg OpY = XSCRATCH1;
if (!refy) {
EOR(OpY, COND1, u8(refy) ^ 1);
} else {
OpY = COND1;
}
ORR(XSCRATCH0, OpX, OpY);
CMP(XSCRATCH0, 0);
break;
}
// Note: TST will AND two registers and set the EQ/NE flags on the result
case Instruction::FlowControlType::And:
EOR(XSCRATCH0, COND0, refx ^ 1);
EOR(XSCRATCH1, COND1, refy ^ 1);
TST(XSCRATCH0, XSCRATCH1);
case Instruction::FlowControlType::And: {
XReg OpX = XSCRATCH0;
if (!refx) {
EOR(OpX, COND0, u8(refx) ^ 1);
} else {
OpX = COND0;
}
XReg OpY = XSCRATCH1;
if (!refy) {
EOR(OpY, COND1, u8(refy) ^ 1);
} else {
OpY = COND1;
}
TST(OpX, OpY);
break;
}
case Instruction::FlowControlType::JustX:
CMP(COND0, refx);
CMP(COND0, u8(refx) ^ 1);
break;
case Instruction::FlowControlType::JustY:
CMP(COND1, refy);
CMP(COND1, u8(refy) ^ 1);
break;
default:
UNREACHABLE();

View File

@@ -401,29 +401,29 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
case Instruction::FlowControlType::Or:
mov(eax, COND0);
mov(ebx, COND1);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
or_(eax, ebx);
mov(al, COND0.cvt8());
mov(bl, COND1.cvt8());
xor_(al, (instr.flow_control.refx.Value() ^ 1));
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
or_(al, bl);
break;
case Instruction::FlowControlType::And:
mov(eax, COND0);
mov(ebx, COND1);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
and_(eax, ebx);
mov(al, COND0);
mov(bl, COND1);
xor_(al, (instr.flow_control.refx.Value() ^ 1));
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
and_(al, bl);
break;
case Instruction::FlowControlType::JustX:
mov(eax, COND0);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
mov(al, COND0);
xor_(al, (instr.flow_control.refx.Value() ^ 1));
break;
case Instruction::FlowControlType::JustY:
mov(eax, COND1);
xor_(eax, (instr.flow_control.refy.Value() ^ 1));
mov(al, COND1);
xor_(al, (instr.flow_control.refy.Value() ^ 1));
break;
}
}
@@ -1002,8 +1002,8 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
mov(LOOPCOUNT_REG, dword[STATE + offsetof(ShaderUnit, address_registers[2])]);
// Load conditional code
mov(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
mov(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);
movzx(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
movzx(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);
// Used to set a register to one
static const __m128 one = {1.f, 1.f, 1.f, 1.f};