Optimize raycast occlusion culling

This helps the compiler (gcc at least) to vectorize this loop, which then leads to the compiler emiting packed loads for the vectors. This leads to a modest but measurable improvement in mspf for the TPS demo.
2025-01-03 17:12:20 +01:00 · 2025-01-03 17:12:20 +01:00 · bbd67b0e3f
parent bdf625bd54
commit bbd67b0e3f
4 changed files with 49 additions and 23 deletions
--- a/core/math/vector3.h
+++ b/core/math/vector3.h
@ -97,6 +97,8 @@ struct [[nodiscard]] Vector3 {
 	_FORCE_INLINE_ void normalize();
 	_FORCE_INLINE_ Vector3 normalized() const;
 	_FORCE_INLINE_ Vector3 normalized_nonzero() const;
 	_FORCE_INLINE_ bool is_normalized() const;
 	_FORCE_INLINE_ Vector3 inverse() const;
 	Vector3 limit_length(real_t p_len = 1.0) const;
@ -512,6 +514,12 @@ void Vector3::normalize() {
 	}
 }
 Vector3 Vector3::normalized_nonzero() const {
 	float lengthsq = x * x + y * y + z * z;
 	float length = Math::sqrt(lengthsq);
 	return Vector3(x / length, y / length, z / length);
 }
 Vector3 Vector3::normalized() const {
 	Vector3 v = *this;
 	v.normalize();
--- a/core/typedefs.h
+++ b/core/typedefs.h
@ -281,6 +281,12 @@ struct _GlobalLock {
 #define _PRINTF_FORMAT_ATTRIBUTE_2_3
 #endif
 #if defined(__GNUC__)
 #define assume_aligned_gd(x, y) __builtin_assume_aligned(x, y)
 #else
 #define assume_aligned_gd(x, y) x
 #endif
 // This is needed due to a strange OpenGL API that expects a pointer
 // type for an argument that is actually an offset.
 #define CAST_INT_TO_UCHAR_PTR(ptr) ((uint8_t *)(uintptr_t)(ptr))
--- a/modules/raycast/raycast_occlusion_cull.cpp
+++ b/modules/raycast/raycast_occlusion_cull.cpp
@ -33,6 +33,7 @@
 #include "core/config/project_settings.h"
 #include "core/object/worker_thread_pool.h"
 #include "core/templates/local_vector.h"
 #include "core/typedefs.h"
 #ifdef __SSE2__
 #include <pmmintrin.h>
@ -109,11 +110,21 @@ void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thr
 	uint32_t total_threads = p_data->thread_count;
 	uint32_t from = p_thread * total_tiles / total_threads;
 	uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads);
-	_generate_camera_rays(p_data, from, to);
+
 	if (p_data->camera_orthogonal) {
 		_generate_camera_rays<true>(p_data, from, to);
 	} else {
 		_generate_camera_rays<false>(p_data, from, to);
 	}
 }
-void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) {
+template <bool orthogonal>
-	const Size2i &buffer_size = sizes[0];
+void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const {
 	// We are going to be telling the compiler that this pointer is aligned for packed loads.
 	DEV_ASSERT((reinterpret_cast<uintptr_t>(p_data) & 15) != 0);
 	const Size2i &buffer_size = sizes.ptr()[0];
 	const CameraRayThreadData *data_aligned = (CameraRayThreadData *)assume_aligned_gd(p_data, 16);
 	for (int i = p_from; i < p_to; i++) {
 		CameraRayTile &tile = camera_rays[i];
@ -126,29 +137,29 @@ void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRa
 			float u = (float(x) + 0.5f) / buffer_size.x;
 			float v = (float(y) + 0.5f) / buffer_size.y;
-			Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp;
+			Vector3 pixel_pos = data_aligned->pixel_corner + u * data_aligned->pixel_u_interp + v * data_aligned->pixel_v_interp;
-			tile.ray.tnear[j] = p_data->z_near;
+			tile.ray.tnear[j] = data_aligned->z_near;
 			Vector3 dir;
-			if (p_data->camera_orthogonal) {
+			if constexpr (orthogonal) {
-				dir = p_data->camera_dir;
+				dir = data_aligned->camera_dir;
-				tile.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near;
+				tile.ray.org_x[j] = pixel_pos.x - dir.x * data_aligned->z_near;
-				tile.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near;
+				tile.ray.org_y[j] = pixel_pos.y - dir.y * data_aligned->z_near;
-				tile.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near;
+				tile.ray.org_z[j] = pixel_pos.z - dir.z * data_aligned->z_near;
 			} else {
-				dir = (pixel_pos - p_data->camera_pos).normalized();
+				dir = (pixel_pos - data_aligned->camera_pos).normalized_nonzero();
-				tile.ray.org_x[j] = p_data->camera_pos.x;
+				tile.ray.org_x[j] = data_aligned->camera_pos.x;
-				tile.ray.org_y[j] = p_data->camera_pos.y;
+				tile.ray.org_y[j] = data_aligned->camera_pos.y;
-				tile.ray.org_z[j] = p_data->camera_pos.z;
+				tile.ray.org_z[j] = data_aligned->camera_pos.z;
-				tile.ray.tnear[j] /= dir.dot(p_data->camera_dir);
+				tile.ray.tnear[j] /= dir.dot(data_aligned->camera_dir);
 			}
 			tile.ray.dir_x[j] = dir.x;
 			tile.ray.dir_y[j] = dir.y;
 			tile.ray.dir_z[j] = dir.z;
-			tile.ray.tfar[j] = p_data->z_far;
+			tile.ray.tfar[j] = data_aligned->z_far;
 			tile.ray.time[j] = 0.0f;
 			tile.ray.flags[j] = 0;
--- a/modules/raycast/raycast_occlusion_cull.h
+++ b/modules/raycast/raycast_occlusion_cull.h
@ -46,21 +46,22 @@ public:
 	private:
 		Size2i tile_grid_size;
-		struct CameraRayThreadData {
+		struct alignas(16) CameraRayThreadData {
 			int thread_count;
 			float z_near;
 			float z_far;
-			Vector3 camera_dir;
+			alignas(16) Vector3 camera_dir;
-			Vector3 camera_pos;
+			alignas(16) Vector3 camera_pos;
-			Vector3 pixel_corner;
+			alignas(16) Vector3 pixel_corner;
-			Vector3 pixel_u_interp;
+			alignas(16) Vector3 pixel_u_interp;
-			Vector3 pixel_v_interp;
+			alignas(16) Vector3 pixel_v_interp;
 			bool camera_orthogonal;
 			Size2i buffer_size;
 		};
 		void _camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data);
-		void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to);
+		template <bool orthogonal>
 		void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const;
 	public:
 		unsigned int camera_rays_tile_count = 0;