1
0
Fork 0

Optimize raycast occlusion culling

This helps the compiler (gcc at least) to vectorize this loop, which
then leads to the compiler emiting packed loads for the vectors.

This leads to a modest but measurable improvement in mspf for the TPS
demo.
This commit is contained in:
HP van Braam 2025-01-03 17:12:20 +01:00
parent bdf625bd54
commit bbd67b0e3f
4 changed files with 49 additions and 23 deletions

View File

@ -97,6 +97,8 @@ struct [[nodiscard]] Vector3 {
_FORCE_INLINE_ void normalize();
_FORCE_INLINE_ Vector3 normalized() const;
_FORCE_INLINE_ Vector3 normalized_nonzero() const;
_FORCE_INLINE_ bool is_normalized() const;
_FORCE_INLINE_ Vector3 inverse() const;
Vector3 limit_length(real_t p_len = 1.0) const;
@ -512,6 +514,12 @@ void Vector3::normalize() {
}
}
Vector3 Vector3::normalized_nonzero() const {
float lengthsq = x * x + y * y + z * z;
float length = Math::sqrt(lengthsq);
return Vector3(x / length, y / length, z / length);
}
Vector3 Vector3::normalized() const {
Vector3 v = *this;
v.normalize();

View File

@ -281,6 +281,12 @@ struct _GlobalLock {
#define _PRINTF_FORMAT_ATTRIBUTE_2_3
#endif
#if defined(__GNUC__)
#define assume_aligned_gd(x, y) __builtin_assume_aligned(x, y)
#else
#define assume_aligned_gd(x, y) x
#endif
// This is needed due to a strange OpenGL API that expects a pointer
// type for an argument that is actually an offset.
#define CAST_INT_TO_UCHAR_PTR(ptr) ((uint8_t *)(uintptr_t)(ptr))

View File

@ -33,6 +33,7 @@
#include "core/config/project_settings.h"
#include "core/object/worker_thread_pool.h"
#include "core/templates/local_vector.h"
#include "core/typedefs.h"
#ifdef __SSE2__
#include <pmmintrin.h>
@ -109,11 +110,21 @@ void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thr
uint32_t total_threads = p_data->thread_count;
uint32_t from = p_thread * total_tiles / total_threads;
uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads);
_generate_camera_rays(p_data, from, to);
if (p_data->camera_orthogonal) {
_generate_camera_rays<true>(p_data, from, to);
} else {
_generate_camera_rays<false>(p_data, from, to);
}
}
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) {
const Size2i &buffer_size = sizes[0];
template <bool orthogonal>
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const {
// We are going to be telling the compiler that this pointer is aligned for packed loads.
DEV_ASSERT((reinterpret_cast<uintptr_t>(p_data) & 15) != 0);
const Size2i &buffer_size = sizes.ptr()[0];
const CameraRayThreadData *data_aligned = (CameraRayThreadData *)assume_aligned_gd(p_data, 16);
for (int i = p_from; i < p_to; i++) {
CameraRayTile &tile = camera_rays[i];
@ -126,29 +137,29 @@ void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRa
float u = (float(x) + 0.5f) / buffer_size.x;
float v = (float(y) + 0.5f) / buffer_size.y;
Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp;
Vector3 pixel_pos = data_aligned->pixel_corner + u * data_aligned->pixel_u_interp + v * data_aligned->pixel_v_interp;
tile.ray.tnear[j] = p_data->z_near;
tile.ray.tnear[j] = data_aligned->z_near;
Vector3 dir;
if (p_data->camera_orthogonal) {
dir = p_data->camera_dir;
tile.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near;
tile.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near;
tile.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near;
if constexpr (orthogonal) {
dir = data_aligned->camera_dir;
tile.ray.org_x[j] = pixel_pos.x - dir.x * data_aligned->z_near;
tile.ray.org_y[j] = pixel_pos.y - dir.y * data_aligned->z_near;
tile.ray.org_z[j] = pixel_pos.z - dir.z * data_aligned->z_near;
} else {
dir = (pixel_pos - p_data->camera_pos).normalized();
tile.ray.org_x[j] = p_data->camera_pos.x;
tile.ray.org_y[j] = p_data->camera_pos.y;
tile.ray.org_z[j] = p_data->camera_pos.z;
tile.ray.tnear[j] /= dir.dot(p_data->camera_dir);
dir = (pixel_pos - data_aligned->camera_pos).normalized_nonzero();
tile.ray.org_x[j] = data_aligned->camera_pos.x;
tile.ray.org_y[j] = data_aligned->camera_pos.y;
tile.ray.org_z[j] = data_aligned->camera_pos.z;
tile.ray.tnear[j] /= dir.dot(data_aligned->camera_dir);
}
tile.ray.dir_x[j] = dir.x;
tile.ray.dir_y[j] = dir.y;
tile.ray.dir_z[j] = dir.z;
tile.ray.tfar[j] = p_data->z_far;
tile.ray.tfar[j] = data_aligned->z_far;
tile.ray.time[j] = 0.0f;
tile.ray.flags[j] = 0;

View File

@ -46,21 +46,22 @@ public:
private:
Size2i tile_grid_size;
struct CameraRayThreadData {
struct alignas(16) CameraRayThreadData {
int thread_count;
float z_near;
float z_far;
Vector3 camera_dir;
Vector3 camera_pos;
Vector3 pixel_corner;
Vector3 pixel_u_interp;
Vector3 pixel_v_interp;
alignas(16) Vector3 camera_dir;
alignas(16) Vector3 camera_pos;
alignas(16) Vector3 pixel_corner;
alignas(16) Vector3 pixel_u_interp;
alignas(16) Vector3 pixel_v_interp;
bool camera_orthogonal;
Size2i buffer_size;
};
void _camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data);
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to);
template <bool orthogonal>
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const;
public:
unsigned int camera_rays_tile_count = 0;