diff --git a/core/math/vector3.h b/core/math/vector3.h index fd0dec3550d..977b195bf2a 100644 --- a/core/math/vector3.h +++ b/core/math/vector3.h @@ -97,6 +97,8 @@ struct [[nodiscard]] Vector3 { _FORCE_INLINE_ void normalize(); _FORCE_INLINE_ Vector3 normalized() const; + _FORCE_INLINE_ Vector3 normalized_nonzero() const; + _FORCE_INLINE_ bool is_normalized() const; _FORCE_INLINE_ Vector3 inverse() const; Vector3 limit_length(real_t p_len = 1.0) const; @@ -512,6 +514,12 @@ void Vector3::normalize() { } } +Vector3 Vector3::normalized_nonzero() const { + float lengthsq = x * x + y * y + z * z; + float length = Math::sqrt(lengthsq); + return Vector3(x / length, y / length, z / length); +} + Vector3 Vector3::normalized() const { Vector3 v = *this; v.normalize(); diff --git a/core/typedefs.h b/core/typedefs.h index b5b44f21691..f9b976b2226 100644 --- a/core/typedefs.h +++ b/core/typedefs.h @@ -281,6 +281,12 @@ struct _GlobalLock { #define _PRINTF_FORMAT_ATTRIBUTE_2_3 #endif +#if defined(__GNUC__) +#define assume_aligned_gd(x, y) __builtin_assume_aligned(x, y) +#else +#define assume_aligned_gd(x, y) x +#endif + // This is needed due to a strange OpenGL API that expects a pointer // type for an argument that is actually an offset. #define CAST_INT_TO_UCHAR_PTR(ptr) ((uint8_t *)(uintptr_t)(ptr)) diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp index 9f95307cf1c..317bd60d5a2 100644 --- a/modules/raycast/raycast_occlusion_cull.cpp +++ b/modules/raycast/raycast_occlusion_cull.cpp @@ -33,6 +33,7 @@ #include "core/config/project_settings.h" #include "core/object/worker_thread_pool.h" #include "core/templates/local_vector.h" +#include "core/typedefs.h" #ifdef __SSE2__ #include @@ -109,11 +110,21 @@ void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thr uint32_t total_threads = p_data->thread_count; uint32_t from = p_thread * total_tiles / total_threads; uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads); - _generate_camera_rays(p_data, from, to); + + if (p_data->camera_orthogonal) { + _generate_camera_rays(p_data, from, to); + } else { + _generate_camera_rays(p_data, from, to); + } } -void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) { - const Size2i &buffer_size = sizes[0]; +template +void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const { + // We are going to be telling the compiler that this pointer is aligned for packed loads. + DEV_ASSERT((reinterpret_cast(p_data) & 15) != 0); + + const Size2i &buffer_size = sizes.ptr()[0]; + const CameraRayThreadData *data_aligned = (CameraRayThreadData *)assume_aligned_gd(p_data, 16); for (int i = p_from; i < p_to; i++) { CameraRayTile &tile = camera_rays[i]; @@ -126,29 +137,29 @@ void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRa float u = (float(x) + 0.5f) / buffer_size.x; float v = (float(y) + 0.5f) / buffer_size.y; - Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp; + Vector3 pixel_pos = data_aligned->pixel_corner + u * data_aligned->pixel_u_interp + v * data_aligned->pixel_v_interp; - tile.ray.tnear[j] = p_data->z_near; + tile.ray.tnear[j] = data_aligned->z_near; Vector3 dir; - if (p_data->camera_orthogonal) { - dir = p_data->camera_dir; - tile.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near; - tile.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near; - tile.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near; + if constexpr (orthogonal) { + dir = data_aligned->camera_dir; + tile.ray.org_x[j] = pixel_pos.x - dir.x * data_aligned->z_near; + tile.ray.org_y[j] = pixel_pos.y - dir.y * data_aligned->z_near; + tile.ray.org_z[j] = pixel_pos.z - dir.z * data_aligned->z_near; } else { - dir = (pixel_pos - p_data->camera_pos).normalized(); - tile.ray.org_x[j] = p_data->camera_pos.x; - tile.ray.org_y[j] = p_data->camera_pos.y; - tile.ray.org_z[j] = p_data->camera_pos.z; - tile.ray.tnear[j] /= dir.dot(p_data->camera_dir); + dir = (pixel_pos - data_aligned->camera_pos).normalized_nonzero(); + tile.ray.org_x[j] = data_aligned->camera_pos.x; + tile.ray.org_y[j] = data_aligned->camera_pos.y; + tile.ray.org_z[j] = data_aligned->camera_pos.z; + tile.ray.tnear[j] /= dir.dot(data_aligned->camera_dir); } tile.ray.dir_x[j] = dir.x; tile.ray.dir_y[j] = dir.y; tile.ray.dir_z[j] = dir.z; - tile.ray.tfar[j] = p_data->z_far; + tile.ray.tfar[j] = data_aligned->z_far; tile.ray.time[j] = 0.0f; tile.ray.flags[j] = 0; diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h index f5b7011abb8..1f69750572d 100644 --- a/modules/raycast/raycast_occlusion_cull.h +++ b/modules/raycast/raycast_occlusion_cull.h @@ -46,21 +46,22 @@ public: private: Size2i tile_grid_size; - struct CameraRayThreadData { + struct alignas(16) CameraRayThreadData { int thread_count; float z_near; float z_far; - Vector3 camera_dir; - Vector3 camera_pos; - Vector3 pixel_corner; - Vector3 pixel_u_interp; - Vector3 pixel_v_interp; + alignas(16) Vector3 camera_dir; + alignas(16) Vector3 camera_pos; + alignas(16) Vector3 pixel_corner; + alignas(16) Vector3 pixel_u_interp; + alignas(16) Vector3 pixel_v_interp; bool camera_orthogonal; Size2i buffer_size; }; void _camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data); - void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to); + template + void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const; public: unsigned int camera_rays_tile_count = 0;