Moar shader decompiler (#559)

* Renderer: Add prepareForDraw callback * Add fmt submodule and port shader decompiler instructions to it * Add shader acceleration setting * Hook up vertex shaders to shader cache * Shader decompiler: Fix redundant compilations * Shader Decompiler: Fix vertex attribute upload * Shader compiler: Simplify generated code for reading and faster compilation * Further simplify shader decompiler output * Shader decompiler: More smallen-ing * Shader decompiler: Get PICA uniforms uploaded to the GPU * Shader decompiler: Readd clipping * Shader decompiler: Actually `break` on control flow instructions * Shader decompiler: More control flow handling * Shader decompiler: Fix desitnation mask * Shader Decomp: Remove pair member capture in lambda (unsupported on NDK) * Disgusting changes to handle the fact that hw shader shaders are 2x as big * Shader decompiler: Implement proper output semantic mapping * Moar instructions * Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI * Shader decompiler: Add register indexing * Shader decompiler: Optimize mova with both x and y masked * Shader decompiler: Add DPH/DPHI * Fix shader caching being broken * PICA decompiler: Cache VS uniforms * Simply vertex cache code * Simplify vertex cache code * Shader decompiler: Add loops * Shader decompiler: Implement safe multiplication * Shader decompiler: Implement LG2/EX2 * Shader decompiler: More control flow * Shader decompiler: Fix JMPU condition * Shader decompiler: Convert main function to void * PICA: Start implementing GPU vertex fetch * More hw VAO work * More hw VAO work * More GPU vertex fetch code * Add GL Stream Buffer from Duckstation * GL: Actually upload data to stream buffers * GPU: Cleanup immediate mode handling * Get first renders working with accelerated draws * Shader decompiler: Fix control flow analysis bugs * HW shaders: Accelerate indexed draws * Shader decompiler: Add support for compilation errors * GLSL decompiler: Fall back for LITP * Add Renderdoc scope classes * Fix control flow analysis bug * HW shaders: Fix attribute fetch * Rewriting hw vertex fetch * Stream buffer: Fix copy-paste mistake * HW shaders: Fix indexed rendering * HW shaders: Add padding attributes * HW shaders: Avoid redundant glVertexAttrib4f calls * HW shaders: Fix loops * HW shaders: Make generated shaders slightly smaller * Fix libretro build * HW shaders: Fix android * Remove redundant ubershader checks * Set accelerate shader default to true * Shader decompiler: Don't declare VS input attributes as an array * Change ubuntu-latest to Ubuntu 24.04 because Microsoft screwed up their CI again * fix merge conflict bug
2024-10-19 16:53:51 +03:00
parent afaf18f124
commit 49a94a13c5
34 changed files with 1877 additions and 253 deletions
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -1,3 +1,7 @@
+#include <fmt/format.h>
+
+#include <utility>
+
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
@@ -702,6 +706,113 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }

+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	// This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
+	// Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
+	std::array<std::pair<int, int>, 32> outputMappings{};
+	// Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
+	std::array<u8, 16> vsOutputRegisters;
+
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+
+		for (int i = 0; i < vertConfig.outputCount; i++) {
+			const u32 config = vertConfig.outmaps[i];
+			for (int j = 0; j < 4; j++) {
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
+			}
+		}
+	}
+
+	auto getSemanticName = [&](u32 semanticIndex) {
+		auto [reg, lane] = outputMappings[semanticIndex];
+		return fmt::format("out_regs[{}][{}]", reg, lane);
+	};
+
+	std::string semantics = fmt::format(
+		R"(
+	vec4 a_coords = vec4({}, {}, {}, {});
+	vec4 a_quaternion = vec4({}, {}, {}, {});
+	vec4 a_vertexColour = vec4({}, {}, {}, {});
+	vec2 a_texcoord0 = vec2({}, {});
+	float a_texcoord0_w = {};
+	vec2 a_texcoord1 = vec2({}, {});
+	vec2 a_texcoord2 = vec2({}, {});
+	vec3 a_view = vec3({}, {}, {});
+)",
+		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
+		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
+		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),
+		getSemanticName(18), getSemanticName(19), getSemanticName(20)
+	);
+
+	if (usingUbershader) {
+		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
+		return picaSource;
+	} else {
+		// TODO: Uniforms and don't hardcode fixed-function semantic indices...
+		std::string ret = picaSource;
+		if (api == API::GLES) {
+			ret += "\n#define USING_GLES\n";
+		}
+
+		ret += uniformDefinition;
+
+		ret += R"(
+out vec4 v_quaternion;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+
+#ifndef USING_GLES
+	out float gl_ClipDistance[2];
+#endif
+
+void main() {
+	pica_shader_main();
+)";
+	// Transfer fixed function fragment registers from vertex shader output to the fragment shader
+	ret += semantics;
+	
+	ret += R"(
+	gl_Position = a_coords;
+	vec4 colourAbs = abs(a_vertexColour);
+	v_colour = min(colourAbs, vec4(1.f));
+
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	v_quaternion = a_quaternion;
+
+#ifndef USING_GLES
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipCoords, a_coords);
+#endif
+})";
+		return ret;
+	}
+}
+
 void FragmentGenerator::compileLogicOps(std::string& shader, const PICA::FragmentConfig& config) {
 	if (api != API::GLES) [[unlikely]] {
 		Helpers::warn("Shadergen: Unsupported API for compileLogicOps");