mirror of https://github.com/godotengine/godot
Optimize raycast occlusion culling
This helps the compiler (gcc at least) to vectorize this loop, which then leads to the compiler emiting packed loads for the vectors. This leads to a modest but measurable improvement in mspf for the TPS demo.
This commit is contained in:
parent
bdf625bd54
commit
bbd67b0e3f
|
|
@ -97,6 +97,8 @@ struct [[nodiscard]] Vector3 {
|
||||||
|
|
||||||
_FORCE_INLINE_ void normalize();
|
_FORCE_INLINE_ void normalize();
|
||||||
_FORCE_INLINE_ Vector3 normalized() const;
|
_FORCE_INLINE_ Vector3 normalized() const;
|
||||||
|
_FORCE_INLINE_ Vector3 normalized_nonzero() const;
|
||||||
|
|
||||||
_FORCE_INLINE_ bool is_normalized() const;
|
_FORCE_INLINE_ bool is_normalized() const;
|
||||||
_FORCE_INLINE_ Vector3 inverse() const;
|
_FORCE_INLINE_ Vector3 inverse() const;
|
||||||
Vector3 limit_length(real_t p_len = 1.0) const;
|
Vector3 limit_length(real_t p_len = 1.0) const;
|
||||||
|
|
@ -512,6 +514,12 @@ void Vector3::normalize() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Vector3 Vector3::normalized_nonzero() const {
|
||||||
|
float lengthsq = x * x + y * y + z * z;
|
||||||
|
float length = Math::sqrt(lengthsq);
|
||||||
|
return Vector3(x / length, y / length, z / length);
|
||||||
|
}
|
||||||
|
|
||||||
Vector3 Vector3::normalized() const {
|
Vector3 Vector3::normalized() const {
|
||||||
Vector3 v = *this;
|
Vector3 v = *this;
|
||||||
v.normalize();
|
v.normalize();
|
||||||
|
|
|
||||||
|
|
@ -281,6 +281,12 @@ struct _GlobalLock {
|
||||||
#define _PRINTF_FORMAT_ATTRIBUTE_2_3
|
#define _PRINTF_FORMAT_ATTRIBUTE_2_3
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__GNUC__)
|
||||||
|
#define assume_aligned_gd(x, y) __builtin_assume_aligned(x, y)
|
||||||
|
#else
|
||||||
|
#define assume_aligned_gd(x, y) x
|
||||||
|
#endif
|
||||||
|
|
||||||
// This is needed due to a strange OpenGL API that expects a pointer
|
// This is needed due to a strange OpenGL API that expects a pointer
|
||||||
// type for an argument that is actually an offset.
|
// type for an argument that is actually an offset.
|
||||||
#define CAST_INT_TO_UCHAR_PTR(ptr) ((uint8_t *)(uintptr_t)(ptr))
|
#define CAST_INT_TO_UCHAR_PTR(ptr) ((uint8_t *)(uintptr_t)(ptr))
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,7 @@
|
||||||
#include "core/config/project_settings.h"
|
#include "core/config/project_settings.h"
|
||||||
#include "core/object/worker_thread_pool.h"
|
#include "core/object/worker_thread_pool.h"
|
||||||
#include "core/templates/local_vector.h"
|
#include "core/templates/local_vector.h"
|
||||||
|
#include "core/typedefs.h"
|
||||||
|
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
@ -109,11 +110,21 @@ void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thr
|
||||||
uint32_t total_threads = p_data->thread_count;
|
uint32_t total_threads = p_data->thread_count;
|
||||||
uint32_t from = p_thread * total_tiles / total_threads;
|
uint32_t from = p_thread * total_tiles / total_threads;
|
||||||
uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads);
|
uint32_t to = (p_thread + 1 == total_threads) ? total_tiles : ((p_thread + 1) * total_tiles / total_threads);
|
||||||
_generate_camera_rays(p_data, from, to);
|
|
||||||
|
if (p_data->camera_orthogonal) {
|
||||||
|
_generate_camera_rays<true>(p_data, from, to);
|
||||||
|
} else {
|
||||||
|
_generate_camera_rays<false>(p_data, from, to);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) {
|
template <bool orthogonal>
|
||||||
const Size2i &buffer_size = sizes[0];
|
void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const {
|
||||||
|
// We are going to be telling the compiler that this pointer is aligned for packed loads.
|
||||||
|
DEV_ASSERT((reinterpret_cast<uintptr_t>(p_data) & 15) != 0);
|
||||||
|
|
||||||
|
const Size2i &buffer_size = sizes.ptr()[0];
|
||||||
|
const CameraRayThreadData *data_aligned = (CameraRayThreadData *)assume_aligned_gd(p_data, 16);
|
||||||
|
|
||||||
for (int i = p_from; i < p_to; i++) {
|
for (int i = p_from; i < p_to; i++) {
|
||||||
CameraRayTile &tile = camera_rays[i];
|
CameraRayTile &tile = camera_rays[i];
|
||||||
|
|
@ -126,29 +137,29 @@ void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const CameraRa
|
||||||
|
|
||||||
float u = (float(x) + 0.5f) / buffer_size.x;
|
float u = (float(x) + 0.5f) / buffer_size.x;
|
||||||
float v = (float(y) + 0.5f) / buffer_size.y;
|
float v = (float(y) + 0.5f) / buffer_size.y;
|
||||||
Vector3 pixel_pos = p_data->pixel_corner + u * p_data->pixel_u_interp + v * p_data->pixel_v_interp;
|
Vector3 pixel_pos = data_aligned->pixel_corner + u * data_aligned->pixel_u_interp + v * data_aligned->pixel_v_interp;
|
||||||
|
|
||||||
tile.ray.tnear[j] = p_data->z_near;
|
tile.ray.tnear[j] = data_aligned->z_near;
|
||||||
|
|
||||||
Vector3 dir;
|
Vector3 dir;
|
||||||
if (p_data->camera_orthogonal) {
|
if constexpr (orthogonal) {
|
||||||
dir = p_data->camera_dir;
|
dir = data_aligned->camera_dir;
|
||||||
tile.ray.org_x[j] = pixel_pos.x - dir.x * p_data->z_near;
|
tile.ray.org_x[j] = pixel_pos.x - dir.x * data_aligned->z_near;
|
||||||
tile.ray.org_y[j] = pixel_pos.y - dir.y * p_data->z_near;
|
tile.ray.org_y[j] = pixel_pos.y - dir.y * data_aligned->z_near;
|
||||||
tile.ray.org_z[j] = pixel_pos.z - dir.z * p_data->z_near;
|
tile.ray.org_z[j] = pixel_pos.z - dir.z * data_aligned->z_near;
|
||||||
} else {
|
} else {
|
||||||
dir = (pixel_pos - p_data->camera_pos).normalized();
|
dir = (pixel_pos - data_aligned->camera_pos).normalized_nonzero();
|
||||||
tile.ray.org_x[j] = p_data->camera_pos.x;
|
tile.ray.org_x[j] = data_aligned->camera_pos.x;
|
||||||
tile.ray.org_y[j] = p_data->camera_pos.y;
|
tile.ray.org_y[j] = data_aligned->camera_pos.y;
|
||||||
tile.ray.org_z[j] = p_data->camera_pos.z;
|
tile.ray.org_z[j] = data_aligned->camera_pos.z;
|
||||||
tile.ray.tnear[j] /= dir.dot(p_data->camera_dir);
|
tile.ray.tnear[j] /= dir.dot(data_aligned->camera_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
tile.ray.dir_x[j] = dir.x;
|
tile.ray.dir_x[j] = dir.x;
|
||||||
tile.ray.dir_y[j] = dir.y;
|
tile.ray.dir_y[j] = dir.y;
|
||||||
tile.ray.dir_z[j] = dir.z;
|
tile.ray.dir_z[j] = dir.z;
|
||||||
|
|
||||||
tile.ray.tfar[j] = p_data->z_far;
|
tile.ray.tfar[j] = data_aligned->z_far;
|
||||||
tile.ray.time[j] = 0.0f;
|
tile.ray.time[j] = 0.0f;
|
||||||
|
|
||||||
tile.ray.flags[j] = 0;
|
tile.ray.flags[j] = 0;
|
||||||
|
|
|
||||||
|
|
@ -46,21 +46,22 @@ public:
|
||||||
private:
|
private:
|
||||||
Size2i tile_grid_size;
|
Size2i tile_grid_size;
|
||||||
|
|
||||||
struct CameraRayThreadData {
|
struct alignas(16) CameraRayThreadData {
|
||||||
int thread_count;
|
int thread_count;
|
||||||
float z_near;
|
float z_near;
|
||||||
float z_far;
|
float z_far;
|
||||||
Vector3 camera_dir;
|
alignas(16) Vector3 camera_dir;
|
||||||
Vector3 camera_pos;
|
alignas(16) Vector3 camera_pos;
|
||||||
Vector3 pixel_corner;
|
alignas(16) Vector3 pixel_corner;
|
||||||
Vector3 pixel_u_interp;
|
alignas(16) Vector3 pixel_u_interp;
|
||||||
Vector3 pixel_v_interp;
|
alignas(16) Vector3 pixel_v_interp;
|
||||||
bool camera_orthogonal;
|
bool camera_orthogonal;
|
||||||
Size2i buffer_size;
|
Size2i buffer_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
void _camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data);
|
void _camera_rays_threaded(uint32_t p_thread, const CameraRayThreadData *p_data);
|
||||||
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to);
|
template <bool orthogonal>
|
||||||
|
void _generate_camera_rays(const CameraRayThreadData *p_data, int p_from, int p_to) const;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
unsigned int camera_rays_tile_count = 0;
|
unsigned int camera_rays_tile_count = 0;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue