From f99d0947782eba6794b6a39eedcaced21f1564a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Sat, 13 Dec 2025 10:38:34 +0100 Subject: [PATCH] meshoptimizer: Update to 1.0 --- thirdparty/README.md | 2 +- thirdparty/meshoptimizer/clusterizer.cpp | 287 ++++++++++++++-------- thirdparty/meshoptimizer/meshoptimizer.h | 116 +++++---- thirdparty/meshoptimizer/partition.cpp | 269 ++++++++++++++------ thirdparty/meshoptimizer/simplifier.cpp | 140 ++++++++--- thirdparty/meshoptimizer/spatialorder.cpp | 1 + thirdparty/meshoptimizer/vertexcodec.cpp | 2 +- thirdparty/meshoptimizer/vertexfilter.cpp | 208 +++++++++------- 8 files changed, 667 insertions(+), 358 deletions(-) diff --git a/thirdparty/README.md b/thirdparty/README.md index 62fe3e0f573..52027b1840f 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -695,7 +695,7 @@ Patches: ## meshoptimizer - Upstream: https://github.com/zeux/meshoptimizer -- Version: 0.25 (6daea4695c48338363b08022d2fb15deaef6ac09, 2025) +- Version: 1.0 (73583c335e541c139821d0de2bf5f12960a04941, 2025) - License: MIT Files extracted from upstream repository: diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp index be7478d388e..73cc0ab535e 100644 --- a/thirdparty/meshoptimizer/clusterizer.cpp +++ b/thirdparty/meshoptimizer/clusterizer.cpp @@ -6,6 +6,17 @@ #include <math.h> #include <string.h> +// The block below auto-detects SIMD ISA that can be used on the target platform +#ifndef MESHOPTIMIZER_NO_SIMD +#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64)) +#define SIMD_SSE +#include <emmintrin.h> +#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922) +#define SIMD_NEON +#include <arm_neon.h> +#endif +#endif // !MESHOPTIMIZER_NO_SIMD + // This work is based on: // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 @@ -149,6 +160,19 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un } } +static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count) +{ + // for sparse inputs, it's faster to only clear vertices referenced by the index buffer + if (vertex_count <= index_count) + memset(used, -1, vertex_count * sizeof(short)); + else + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + used[indices[i]] = -1; + } +} + static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count) { static const float kAxes[7][3] = { @@ -265,21 +289,8 @@ struct Cone float nx, ny, nz; }; -static float getDistance(float dx, float dy, float dz, bool aa) -{ - if (!aa) - return sqrtf(dx * dx + dy * dy + dz * dz); - - float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz); - float rxy = rx > ry ? rx : ry; - return rxy > rz ? rxy : rz; -} - static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius) { - if (cone_weight < 0) - return 1 + distance / expected_radius; - float cone = 1.f - spread * cone_weight; float cone_clamped = cone < 1e-3f ? 
1e-3f : cone; @@ -348,15 +359,6 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices, return mesh_area; } -static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles) -{ - size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; - - // fill 4b padding with 0 - while (offset & 3) - meshlet_triangles[offset++] = 0; -} - static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false) { short& av = used[a]; @@ -374,10 +376,8 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int for (size_t j = 0; j < meshlet.vertex_count; ++j) used[meshlet_vertices[meshlet.vertex_offset + j]] = -1; - finishMeshlet(meshlet, meshlet_triangles); - meshlet.vertex_offset += meshlet.vertex_count; - meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding + meshlet.triangle_offset += meshlet.triangle_count * 3; meshlet.vertex_count = 0; meshlet.triangle_count = 0; @@ -453,7 +453,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co const Cone& tri_cone = triangles[triangle]; float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz; - float distance = getDistance(dx, dy, dz, cone_weight < 0); + float distance = sqrtf(dx * dx + dy * dy + dz * dz); float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius); @@ -514,7 +514,8 @@ static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& me if (best_neighbor == ~0u) continue; - float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false); + float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz; + float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz); for (size_t j = 0; j < kMeshletAddSeeds; ++j) { @@ -566,7 +567,8 @@ static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_co unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2]; unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c]; - float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false); + float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); if (live < best_live || (live == best_live && score < best_score)) { @@ -587,13 +589,13 @@ struct KDNode unsigned int index; }; - // leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point) + // leaves: axis = 3, children = number of points including this one // branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children unsigned int axis : 2; unsigned int children : 30; }; -static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot) +static size_t kdtreePartition(unsigned int* indices, size_t count, const float* 
points, size_t stride, int axis, float pivot) { size_t m = 0; @@ -623,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u result.index = indices[0]; result.axis = 3; - result.children = unsigned(count - 1); + result.children = unsigned(count); // all remaining points are stored in nodes immediately following the leaf for (size_t i = 1; i < count; ++i) @@ -638,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u return offset + count; } -static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size) +static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth) { assert(count > 0); assert(offset < node_count); @@ -664,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const } // split axis is one where the variance is largest - unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); float split = mean[axis]; size_t middle = kdtreePartition(indices, count, points, stride, axis, split); // when the partition is degenerate simply consolidate the points into a single node - if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2) + // this also ensures recursion depth is bounded on pathological inputs + if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth) return kdtreeBuildLeaf(offset, nodes, node_count, indices, count); KDNode& result = nodes[offset]; @@ -679,32 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const result.axis = axis; // left subtree is right after our node - size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size); + size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1); // distance to the right subtree is represented explicitly + assert(next_offset - offset > 1); result.children = unsigned(next_offset - offset - 1); - return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); + return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1); } -static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit) +static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit) { const KDNode& node = nodes[root]; + if (node.children == 0) + return; + if (node.axis == 3) { // leaf - for (unsigned int i = 0; i <= node.children; ++i) + bool inactive = true; + + for (unsigned int i = 0; i < node.children; ++i) { unsigned int index = nodes[root + i].index; if (emitted_flags[index]) continue; + inactive = false; + const float* point = points + index * stride; float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2]; - float distance = getDistance(dx, dy, dz, aa); + float distance = sqrtf(dx * dx + dy * dy + dz * dz); if 
(distance < limit) { @@ -712,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, limit = distance; } } + + // deactivate leaves that no longer have items to emit + if (inactive) + nodes[root].children = 0; } else { @@ -720,34 +735,87 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, unsigned int first = (delta <= 0) ? 0 : node.children; unsigned int second = first ^ node.children; - kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit); + // deactivate branches that no longer have items to emit to accelerate traversal + // note that we do this *before* recursing which delays deactivation but keeps tail calls + if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0) + nodes[root].children = 0; + + // recursion depth is bounded by tree depth (which is limited by construction) + kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit); // only process the other node if it can have a match based on closest distance so far if (fabsf(delta) <= limit) - kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit); + kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit); } } +struct BVHBoxT +{ + float min[4]; + float max[4]; +}; + struct BVHBox { float min[3]; float max[3]; }; -static void boxMerge(BVHBox& box, const BVHBox& other) +#if defined(SIMD_SSE) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + __m128 min = _mm_loadu_ps(box.min); + __m128 max = _mm_loadu_ps(box.max); + + // note: over-read is safe because BVHBox array is allocated with padding + min = _mm_min_ps(min, _mm_loadu_ps(other.min)); + max = _mm_max_ps(max, _mm_loadu_ps(other.max)); + + _mm_storeu_ps(box.min, min); + _mm_storeu_ps(box.max, max); + + __m128 size = _mm_sub_ps(max, min); + __m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1)); + __m128 mul = _mm_mul_ps(size, size_yzx); + __m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1))); + __m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2))); + + return _mm_cvtss_f32(sum_xyz); +} +#elif defined(SIMD_NEON) +static float boxMerge(BVHBoxT& box, const BVHBox& other) +{ + float32x4_t min = vld1q_f32(box.min); + float32x4_t max = vld1q_f32(box.max); + + // note: over-read is safe because BVHBox array is allocated with padding + min = vminq_f32(min, vld1q_f32(other.min)); + max = vmaxq_f32(max, vld1q_f32(other.max)); + + vst1q_f32(box.min, min); + vst1q_f32(box.max, max); + + float32x4_t size = vsubq_f32(max, min); + float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2); + float32x4_t mul = vmulq_f32(size, size_yzx); + float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1); + float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2); + + return sum_xyz; +} +#else +static float boxMerge(BVHBoxT& box, const BVHBox& other) { for (int k = 0; k < 3; ++k) { box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k]; box.max[k] = other.max[k] > box.max[k] ? 
other.max[k] : box.max[k]; } -} -inline float boxSurface(const BVHBox& box) -{ float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2]; return sx * sy + sx * sz + sy * sz; } +#endif inline unsigned int radixFloat(unsigned int v) { @@ -833,7 +901,7 @@ static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indi } } -static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices) +static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL) { // count number of unique vertices size_t used_vertices = 0; @@ -844,6 +912,9 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0); used[a] = used[b] = used[c] = 1; + + if (out) + out[i] = unsigned(used_vertices); } // reset used[] for future invocations @@ -855,16 +926,16 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size used[a] = used[b] = used[c] = -1; } - if (used_vertices > max_vertices) - return false; + return used_vertices; +} +static void bvhPackLeaf(unsigned char* boundary, size_t count) +{ // mark meshlet boundary for future reassembly assert(count > 0); boundary[0] = 1; memset(boundary + 1, 0, count - 1); - - return true; } static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles) @@ -873,8 +944,9 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size { size_t chunk = i + max_triangles <= count ? max_triangles : count - i; - if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices)) + if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices) { + bvhPackLeaf(boundary + i, chunk); i += chunk; continue; } @@ -882,7 +954,7 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size // chunk is vertex bound, split it into smaller meshlets assert(chunk > max_vertices / 3); - bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices); + bvhPackLeaf(boundary + i, max_vertices / 3); i += max_vertices / 3; } } @@ -895,25 +967,27 @@ static bool bvhDivisible(size_t count, size_t min, size_t max) return min * 2 <= max ? 
count >= min : count % min <= (count / min) * (max - min); } -static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost) +static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count) { - BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]]; float* costs = static_cast<float*>(scratch); + BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}}; + BVHBoxT accumr = accuml; - // accumulate SAH cost in forward and backward directions for (size_t i = 0; i < count; ++i) { - boxMerge(accuml, boxes[order[i]]); - boxMerge(accumr, boxes[order[count - 1 - i]]); + float larea = boxMerge(accuml, boxes[order[i]]); + float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]); - costs[i] = boxSurface(accuml); - costs[i + count] = boxSurface(accumr); + areas[i] = larea; + areas[i + count] = rarea; } +} +static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost) +{ bool aligned = count >= min * 2 && bvhDivisible(count, min, max); size_t end = aligned ? count - min : count - 1; - float rmaxf = 1.f / float(int(max)); + float rmaxfill = 1.f / float(int(maxfill)); // find best split that minimizes SAH size_t bestsplit = 0; @@ -928,17 +1002,22 @@ static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t co if (aligned && !bvhDivisible(rsplit, min, max)) continue; - // costs[x] = inclusive surface area of boxes[0..x] - // costs[count-1-x] = inclusive surface area of boxes[x..count-1] - float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count]; + // areas[x] = inclusive surface area of boxes[0..x] + // areas[count-1-x] = inclusive surface area of boxes[x..count-1] + float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count]; float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit)); if (cost > bestcost) continue; - // fill cost; use floating point math to avoid expensive integer modulo - int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit); - int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit); + // use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count + // using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice + size_t lfill = vertices ? vertices[i] : lsplit; + size_t rfill = vertices ? 
vertices[i] : rsplit; + + // fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo + int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill); + int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill); cost += fill * (float(lrest) * larea + float(rrest) * rarea); @@ -971,11 +1050,8 @@ static void bvhPartition(unsigned int* target, const unsigned int* order, const static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight) { - if (depth >= kMeshletMaxTreeDepth) - return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); - - if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices)) - return; + if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices) + return bvhPackLeaf(boundary, count); unsigned int* axes[3] = {orderx, ordery, orderz}; @@ -984,9 +1060,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or // if we could not pack the meshlet, we must be vertex bound size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles; - - // only use fill weight if we are optimizing for triangle count - float fill = count <= max_triangles ? 0.f : fill_weight; + size_t maxfill = count <= max_triangles ? max_vertices : max_triangles; // find best split that minimizes SAH int bestk = -1; @@ -995,8 +1069,20 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or for (int k = 0; k < 3; ++k) { + float* areas = static_cast<float*>(scratch); + unsigned int* vertices = NULL; + + bvhComputeArea(areas, boxes, axes[k], count); + + if (count <= max_triangles) + { + // for vertex bound clusters, count number of unique vertices for each split + vertices = reinterpret_cast<unsigned int*>(areas + 2 * count); + bvhCountVertices(axes[k], count, used, indices, vertices); + } + float axiscost = FLT_MAX; - size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost); + size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost); if (axissplit && axiscost < bestcost) { @@ -1006,8 +1092,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or } } - // this may happen if SAH costs along the admissible splits are NaN - if (bestk < 0) + // this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs + if (bestk < 0 || depth >= kMeshletMaxTreeDepth) return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles); // mark sides of split for partitioning @@ -1032,6 +1118,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or bvhPartition(axis, temp, sides, bestsplit, count); } + // recursion depth is bounded due to max depth check above bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight); bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, 
max_triangles, fill_weight); } @@ -1045,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_ assert(index_count % 3 == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned (void)kMeshletMaxVertices; (void)kMeshletMaxTriangles; @@ -1070,9 +1156,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned - assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets + assert(cone_weight >= 0 && cone_weight <= 1); assert(split_factor >= 0); if (index_count == 0) return 0; @@ -1108,7 +1193,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle kdindices[i] = unsigned(i); KDNode* nodes = allocator.allocate<KDNode>(face_count * 2); - kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8); + kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0); // find a specific corner of the mesh to use as a starting point for meshlet flow float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX; @@ -1124,7 +1209,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate<short>(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, indices, index_count); // initial seed triangle is the one closest to the corner unsigned int initial_seed = ~0u; @@ -1134,7 +1219,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle { const Cone& tri = triangles[i]; - float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false); + float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz; + float score = sqrtf(dx * dx + dy * dy + dz * dz); if (initial_seed == ~0u || score < initial_score) { @@ -1174,7 +1260,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle unsigned int index = ~0u; float distance = FLT_MAX; - kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance); + kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance); best_triangle = index; split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor; @@ -1244,20 +1330,15 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } 
size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight) { - assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex - return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f); } @@ -1269,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); - assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned meshopt_Allocator allocator; // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate<short>(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, indices, index_count); meshopt_Meshlet meshlet = {}; size_t meshlet_offset = 0; @@ -1290,13 +1370,10 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle } if (meshlet.triangle_count) - { - finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } @@ -1310,7 +1387,6 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles); - assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned if (index_count == 0) return 0; @@ -1321,13 +1397,14 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i meshopt_Allocator allocator; // 3 floats plus 1 uint for sorting, or - // 2 floats for SAH costs, or + // 2 floats plus 1 uint for pivoting, or // 1 uint plus 1 byte for partitioning float* scratch = allocator.allocate<float>(face_count * 4); // compute bounding boxes and centroids for sorting - BVHBox* boxes = allocator.allocate<BVHBox>(face_count); + BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float); + memset(boxes + face_count, 0, sizeof(BVHBox)); unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3); unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3; @@ -1351,7 +1428,7 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i // index of the vertex in the meshlet, -1 if the vertex isn't used short* used = allocator.allocate<short>(vertex_count); - memset(used, -1, vertex_count * sizeof(short)); + clearUsed(used, vertex_count, indices, index_count); unsigned char* boundary = allocator.allocate<unsigned char>(face_count); @@ -1392,13 +1469,10 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i } if (meshlet.triangle_count) - { - 
finishMeshlet(meshlet, meshlet_triangles); - meshlets[meshlet_offset++] = meshlet; - } assert(meshlet_offset <= meshlet_bound); + assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count); return meshlet_offset; } @@ -1694,3 +1768,6 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh assert(vertex_offset <= vertex_count); memcpy(vertices, order, vertex_offset * sizeof(unsigned int)); } + +#undef SIMD_SSE +#undef SIMD_NEON diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h index e7fb2fecfe0..c9239bc301f 100644 --- a/thirdparty/meshoptimizer/meshoptimizer.h +++ b/thirdparty/meshoptimizer/meshoptimizer.h @@ -1,5 +1,5 @@ /** - * meshoptimizer - version 0.25 + * meshoptimizer - version 1.0 * * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer @@ -12,7 +12,7 @@ #include <stddef.h> /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 250 /* 0.25 */ +#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -125,14 +125,14 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); /** - * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index. + * Generates a remap table that maps all vertices with the same position to the same (existing) index. * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering. * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification. * * destination must contain enough space for the resulting remap table (vertex_count elements) * vertex_positions should have float3 position in the first 12 bytes of each vertex */ -MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology @@ -179,7 +179,7 @@ MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* dest /** * Vertex transform cache optimizer * Reorders indices to reduce the number of GPU vertex shader invocations - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. 
* * destination must contain enough space for the resulting index buffer (index_count elements) */ @@ -198,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio * Vertex transform cache optimizer for FIFO caches * Reorders indices to reduce the number of GPU vertex shader invocations * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. * * destination must contain enough space for the resulting index buffer (index_count elements) * cache_size should be less than the actual GPU cache size to avoid cache thrashing @@ -208,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination /** * Overdraw optimizer * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw - * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually. + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. * * destination must contain enough space for the resulting index buffer (index_count elements) * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) @@ -221,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const * Vertex fetch cache optimizer * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused - * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream. + * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream. * * destination must contain enough space for the resulting vertex buffer (vertex_count elements) * indices is used both as an input and as an output index buffer @@ -251,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count); /** - * Set index encoder format version + * Set index encoder format version (defaults to 1) + * * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+) */ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version); @@ -303,6 +304,7 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first. 
* * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size); MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size); @@ -313,13 +315,16 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, si * For compression level to take effect, the vertex encoding version must be set to 1. * The default compression level implied by meshopt_encodeVertexBuffer is 2. * + * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + * vertex_size must be a multiple of 4 (and <= 256) * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio. * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0. */ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version); /** - * Set vertex encoder format version + * Set vertex encoder format version (defaults to 1) + * * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+) */ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); @@ -331,6 +336,7 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version); * The decoder is safe to use for untrusted input, but it may produce garbage data. * * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes) + * vertex_size must be a multiple of 4 (and <= 256) */ MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size); @@ -345,29 +351,29 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s * Vertex buffer filters * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place. * - * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f. + * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f. * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. * - * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct. + * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct. * Each component is stored as an 16-bit integer; stride must be equal to 8. * * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M. * Each 32-bit component is decoded in isolation; stride must be divisible by 4. 
* - * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization. + * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits. * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. */ MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride); MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride); MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride); +MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride); /** * Vertex buffer filter encoders * These functions can be used to encode data in a format that meshopt_decodeFilter can decode * - * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output. - * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. + * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is. * Input data must contain 4 floats for every vector (count*4 total). * * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding. @@ -378,7 +384,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t c * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4. * Input data must contain stride/4 floats for every vector (count*stride/4 total). * - * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization. + * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits. * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8. * Input data must contain 4 floats for every color (count*4 total). */ @@ -397,7 +403,7 @@ enum meshopt_EncodeExpMode MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data); MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data); MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode); -MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data); +MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data); /** * Simplification options @@ -412,7 +418,7 @@ enum meshopt_SimplifyErrorAbsolute = 1 << 2, /* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. 
*/ meshopt_SimplifyPrune = 1 << 3, - /* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */ + /* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */ meshopt_SimplifyRegularize = 1 << 4, /* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */ meshopt_SimplifyPermissive = 1 << 5, @@ -472,7 +478,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier with position/attribute update + * Mesh simplifier with position/attribute update * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible. * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance. * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. @@ -492,10 +498,10 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** - * Experimental: Mesh simplifier (sloppy) + * Mesh simplifier (sloppy) * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error. * Returns the number of indices after simplification, with destination containing new index data @@ -508,7 +514,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indic * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); +MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error); /** * Mesh simplifier (pruner) @@ -639,7 +645,7 @@ struct meshopt_Meshlet unsigned int vertex_offset; unsigned int triangle_offset; - /* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */ + /* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */ unsigned int vertex_count; unsigned int triangle_count; }; @@ -653,10 +659,10 @@ struct meshopt_Meshlet * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first. * * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound - * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4) + * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512) * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight); @@ -664,40 +670,38 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); /** - * Experimental: Meshlet builder with flexible cluster sizes + * Meshlet builder with flexible cluster sizes * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet. 
* Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor. - * Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value. * - * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!) - * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4) - * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) + * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor); /** - * Experimental: Meshlet builder that produces clusters optimized for raytracing + * Meshlet builder that produces clusters optimized for raytracing * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet. * - * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!) 
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices - * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 + * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!) + * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!) + * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements * vertex_positions should have float3 position in the first 12 bytes of each vertex - * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4) + * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles) * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); +MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight); /** * Meshlet optimizer - * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput + * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes. * - * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these - * need to be computed from meshlet's vertex_offset and triangle_offset + * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512) */ MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count); @@ -756,13 +760,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float* /** * Cluster partitioner * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other. + * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs. 
* * destination must contain enough space for the resulting partition data (cluster_count elements) * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count - * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used) - * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger + * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex + * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3) */ MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size); @@ -805,7 +810,7 @@ MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v); /** * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation - * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest + * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest * Assumes N is in a valid mantissa precision range, which is 1..23 */ MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N); @@ -904,13 +909,15 @@ inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float* template <typename T> inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL); template <typename T> +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL); +template <typename T> inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error); template <typename T> inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index); template <typename T> inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index); template <typename T> -inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size); +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size); template <typename T> inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); template <typename T> @@ -1288,6 +1295,15 @@ inline 
size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error); } +template +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error) +{ + meshopt_IndexAdapter in(NULL, indices, index_count); + meshopt_IndexAdapter out(destination, NULL, index_count); + + return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error); +} + template inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error) { @@ -1316,11 +1332,11 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_ } template -inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size) +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size) { meshopt_IndexAdapter in(NULL, indices, index_count); - return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size); + return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size); } template diff --git a/thirdparty/meshoptimizer/partition.cpp b/thirdparty/meshoptimizer/partition.cpp index 3edc8644216..4119a53ed2c 100644 --- a/thirdparty/meshoptimizer/partition.cpp +++ b/thirdparty/meshoptimizer/partition.cpp @@ -10,6 +10,9 @@ namespace meshopt { +// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth +const int kMergeDepthCutoff = 40; + struct ClusterAdjacency { unsigned int* offsets; @@ -52,49 +55,44 @@ static void filterClusterIndices(unsigned int* data, unsigned int* offsets, cons offsets[cluster_count] = unsigned(cluster_write); } -static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride) +static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center) { size_t vertex_stride_float = vertex_positions_stride / sizeof(float); - for (size_t i = 0; i < cluster_count; ++i) + float center[3] = {0, 0, 0}; + + // approximate center of the cluster by averaging all vertex positions + for (size_t j = 0; j < index_count; ++j) { - float center[3] = {0, 0, 0}; + const float* p = vertex_positions + indices[j] * vertex_stride_float; - // approximate center of the cluster by averaging all vertex positions - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - center[0] += p[0]; - center[1] += p[1]; - center[2] += p[2]; - } - - // note: technically clusters can't be empty per 
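The new meshopt_simplifySloppy overload above accepts a per-vertex lock array; a minimal sketch, assuming the same convention as meshopt_simplify's vertex_lock where a value of 1 marks a vertex that must be preserved (containers and targets are hypothetical):

    std::vector<unsigned char> lock(vertices.size(), 0);
    lock[boundary_vertex] = 1; // hypothetical index of a vertex to keep in place

    std::vector<unsigned int> lod(indices.size());
    float error = 0.f;
    lod.resize(meshopt_simplifySloppy(lod.data(), indices.data(), indices.size(),
        &vertices[0].x, vertices.size(), sizeof(Vertex), lock.data(),
        /* target_index_count= */ indices.size() / 4, /* target_error= */ 1e-2f, &error));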
meshopt_partitionCluster but we check for a division by zero in case that changes - if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i]) - { - center[0] /= float(cluster_size); - center[1] /= float(cluster_size); - center[2] /= float(cluster_size); - } - - // compute radius of the bounding sphere for each cluster - float radiussq = 0; - - for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) - { - const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float; - - float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); - - radiussq = radiussq < d2 ? d2 : radiussq; - } - - cluster_bounds[i * 4 + 0] = center[0]; - cluster_bounds[i * 4 + 1] = center[1]; - cluster_bounds[i * 4 + 2] = center[2]; - cluster_bounds[i * 4 + 3] = sqrtf(radiussq); + center[0] += p[0]; + center[1] += p[1]; + center[2] += p[2]; } + + // note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes + if (index_count) + { + center[0] /= float(index_count); + center[1] /= float(index_count); + center[2] /= float(index_count); + } + + // compute radius of the bounding sphere for each cluster + float radiussq = 0; + + for (size_t j = 0; j < index_count; ++j) + { + const float* p = vertex_positions + indices[j] * vertex_stride_float; + + float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); + + radiussq = radiussq < d2 ? d2 : radiussq; + } + + memcpy(out_center, center, sizeof(center)); + return sqrtf(radiussq); } static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator) @@ -211,6 +209,9 @@ struct ClusterGroup int next; unsigned int size; // 0 unless root unsigned int vertices; + + float center[3]; + float radius; }; struct GroupOrder @@ -285,15 +286,18 @@ static unsigned int countShared(const ClusterGroup* groups, int group1, int grou return total; } -static void mergeBounds(float* target, const float* source) +static void mergeBounds(ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); if (d + r1 < r2) { - memcpy(target, source, 4 * sizeof(float)); + target.center[0] = source.center[0]; + target.center[1] = source.center[1]; + target.center[2] = source.center[2]; + target.radius = source.radius; return; } @@ -301,17 +305,17 @@ static void mergeBounds(float* target, const float* source) { float k = d > 0 ? 
(d + r2 - r1) / (2 * d) : 0.f; - target[0] += dx * k; - target[1] += dy * k; - target[2] += dz * k; - target[3] = (d + r2 + r1) / 2; + target.center[0] += dx * k; + target.center[1] += dy * k; + target.center[2] += dz * k; + target.radius = (d + r2 + r1) / 2; } } -static float boundsScore(const float* target, const float* source) +static float boundsScore(const ClusterGroup& target, const ClusterGroup& source) { - float r1 = target[3], r2 = source[3]; - float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2]; + float r1 = target.radius, r2 = source.radius; + float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2]; float d = sqrtf(dx * dx + dy * dy + dz * dz); float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2); @@ -319,7 +323,7 @@ static float boundsScore(const float* target, const float* source) return mr > 0 ? r1 / mr : 0.f; } -static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds) +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds) { assert(groups[id].size > 0); @@ -347,8 +351,8 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj float score = float(int(shared)) * (group_rsqrt + other_rsqrt); // incorporate spatial score to favor merging nearby groups - if (cluster_bounds) - score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]); + if (use_bounds) + score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]); if (score > best_score) { @@ -361,6 +365,120 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj return best_group; } +static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size) +{ + for (size_t i = 0; i < count; ++i) + { + unsigned int id = order[i]; + if (groups[id].size == 0 || groups[id].size >= target_partition_size) + continue; + + float best_score = -1.f; + int best_group = -1; + + for (size_t j = 0; j < count; ++j) + { + unsigned int other = order[j]; + if (id == other || groups[other].size == 0) + continue; + + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + // favor merging nearby groups + float score = boundsScore(groups[id], groups[other]); + + if (score > best_score) + { + best_score = score; + best_group = other; + } + } + + // merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target + if (best_group != -1) + { + // combine groups by linking them together + unsigned int tail = best_group; + while (groups[tail].next >= 0) + tail = groups[tail].next; + + groups[tail].next = id; + + // update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge + groups[best_group].size += groups[id].size; + groups[id].size = 0; + + // merge bounding spheres + mergeBounds(groups[best_group], groups[id]); + groups[id].radius = 0.f; + } + } +} + +static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot) +{ + size_t m = 0; + + // invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot + for (size_t i = 0; i < count; ++i) + { + float v = groups[order[i]].center[axis]; + + // swap(m, 
i) unconditionally + unsigned int t = order[m]; + order[m] = order[i]; + order[i] = t; + + // when v >= pivot, we swap i with m without advancing it, preserving invariants + m += v < pivot; + } + + return m; +} + +static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth) +{ + size_t total = 0; + for (size_t i = 0; i < count; ++i) + total += groups[order[i]].size; + + if (total <= max_partition_size || count <= leaf_size) + return mergeLeaf(groups, order, count, target_partition_size, max_partition_size); + + float mean[3] = {}; + float vars[3] = {}; + float runc = 1, runs = 1; + + // gather statistics on the points in the subtree using Welford's algorithm + for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc) + { + const float* point = groups[order[i]].center; + + for (int k = 0; k < 3; ++k) + { + float delta = point[k] - mean[k]; + mean[k] += delta * runs; + vars[k] += delta * (point[k] - mean[k]); + } + } + + // split axis is one where the variance is largest + int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2); + + float split = mean[axis]; + size_t middle = mergePartition(order, count, groups, axis, split); + + // enforce balance for degenerate partitions + // this also ensures recursion depth is bounded on pathological inputs + if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff) + middle = count / 2; + + // recursion depth is logarithmic and bounded due to max depth check above + mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1); + mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1); +} + } // namespace meshopt size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size) @@ -371,7 +489,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* assert(vertex_positions_stride % sizeof(float) == 0); assert(target_partition_size > 0); - size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8; + size_t max_partition_size = target_partition_size + target_partition_size / 3; meshopt_Allocator allocator; @@ -385,20 +503,12 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count); cluster_indices = cluster_newindices; - // compute bounding sphere for each cluster if positions are provided - float* cluster_bounds = NULL; - - if (vertex_positions) - { - cluster_bounds = allocator.allocate(cluster_count * 4); - computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride); - } - // build cluster adjacency along with edge weights (shared vertex count) ClusterAdjacency adjacency = {}; buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator); ClusterGroup* groups = allocator.allocate(cluster_count); + memset(groups, 0, sizeof(ClusterGroup) * cluster_count); GroupOrder* order = 
allocator.allocate(cluster_count); size_t pending = 0; @@ -412,6 +522,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i]; assert(groups[i].vertices > 0); + // compute bounding sphere for each cluster if positions are provided + if (vertex_positions) + groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center); + GroupOrder item = {}; item.id = unsigned(i); item.order = groups[i].vertices; @@ -439,7 +553,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* if (groups[top.id].size >= target_partition_size) continue; - int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds); + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions); // we can't grow the group any more, emit as is if (best_group == -1) @@ -449,14 +563,11 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* unsigned int shared = countShared(groups, top.id, best_group, adjacency); // combine groups by linking them together - assert(groups[best_group].size > 0); + unsigned int tail = top.id; + while (groups[tail].next >= 0) + tail = groups[tail].next; - for (int i = top.id; i >= 0; i = groups[i].next) - if (groups[i].next < 0) - { - groups[i].next = best_group; - break; - } + groups[tail].next = best_group; // update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size groups[top.id].size += groups[best_group].size; @@ -467,10 +578,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* groups[best_group].vertices = 0; // merge bounding spheres if bounds are available - if (cluster_bounds) + if (vertex_positions) { - mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]); - memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float)); + mergeBounds(groups[top.id], groups[best_group]); + groups[best_group].radius = 0; } // re-associate all clusters back to the merged group @@ -481,6 +592,20 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* heapPush(order, pending++, top); } + // if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone + if (vertex_positions) + { + unsigned int* merge_order = reinterpret_cast(order); + size_t merge_offset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + if (groups[i].size) + merge_order[merge_offset++] = unsigned(i); + + mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0); + } + + // output each remaining group size_t next_group = 0; for (size_t i = 0; i < cluster_count; ++i) diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp index f1effc38eae..14d4d42fed8 100644 --- a/thirdparty/meshoptimizer/simplifier.cpp +++ b/thirdparty/meshoptimizer/simplifier.cpp @@ -243,14 +243,18 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, { // use a bit set to compute the precise number of unique vertices unsigned char* filter = allocator.allocate((vertex_count + 7) / 8); - memset(filter, 0, (vertex_count + 7) / 8); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + 
assert(index < vertex_count); + filter[index / 8] = 0; + } size_t unique = 0; for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - assert(index < vertex_count); - unique += (filter[index / 8] & (1 << (index % 8))) == 0; filter[index / 8] |= 1 << (index % 8); } @@ -269,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count, for (size_t i = 0; i < index_count; ++i) { unsigned int index = indices[i]; - unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u); if (*entry == ~0u) @@ -364,8 +367,8 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned memset(loopback, -1, vertex_count * sizeof(unsigned int)); // incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1 - // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam - // but here it's okay to fill the data out for other types of vertices as well + // note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default + // in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices unsigned int* openinc = loopback; unsigned int* openout = loop; @@ -617,7 +620,7 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data } } -static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update) +static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock) { size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float); size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float); @@ -629,12 +632,20 @@ static void finalizeVertices(float* vertex_positions_data, size_t vertex_positio unsigned int ri = sparse_remap ? 
sparse_remap[i] : unsigned(i); - const Vector3& p = vertex_positions[i]; - float* v = vertex_positions_data + ri * vertex_positions_stride_float; + // updating externally locked vertices is not allowed + if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0) + continue; - v[0] = p.x * vertex_scale + vertex_offset[0]; - v[1] = p.y * vertex_scale + vertex_offset[1]; - v[2] = p.z * vertex_scale + vertex_offset[2]; + // moving locked vertices may result in floating point drift + if (vertex_kind[i] != Kind_Locked) + { + const Vector3& p = vertex_positions[i]; + float* v = vertex_positions_data + ri * vertex_positions_stride_float; + + v[0] = p.x * vertex_scale + vertex_offset[0]; + v[1] = p.y * vertex_scale + vertex_offset[1]; + v[2] = p.z * vertex_scale + vertex_offset[2]; + } if (attribute_count) { @@ -1260,6 +1271,20 @@ static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3 return sqrtf(result); } +static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback) +{ + unsigned int r = remap[target]; + + // use loop metadata to guide complex collapses towards the correct wedge + // this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams + if (loop[v] != ~0u && remap[loop[v]] == r) + return loop[v]; + else if (loopback[v] != ~0u && remap[loopback[v]] == r) + return loopback[v]; + else + return target; +} + static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind) { size_t dual_count = 0; @@ -1382,15 +1407,22 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const } else { - // complex edges can have multiple wedges, so we need to aggregate errors for all wedges - // this is different from seams (where we aggregate pairwise) because all wedges collapse onto the same target + // complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target if (vertex_kind[i0] == Kind_Complex) for (unsigned int v = wedge[i0]; v != i0; v = wedge[v]) - ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]); + { + unsigned int t = getComplexTarget(v, i1, remap, loop, loopback); + + ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]); + } if (vertex_kind[i1] == Kind_Complex && bidi) for (unsigned int v = wedge[i1]; v != i1; v = wedge[v]) - ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]); + { + unsigned int t = getComplexTarget(v, i0, remap, loop, loopback); + + ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]); + } } } @@ -1542,7 +1574,9 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* do { - collapse_remap[v] = i1; + unsigned int t = getComplexTarget(v, i1, remap, loop, loopback); + + collapse_remap[v] = t; v = wedge[v]; } while (v != i0); } @@ -1634,10 +1668,10 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou } } -static void 
solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update) +static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update) { #if TRACE - size_t stats[5] = {}; + size_t stats[6] = {}; #endif for (size_t i = 0; i < vertex_count; ++i) @@ -1645,7 +1679,6 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s if (!vertex_update[i]) continue; - // moving externally locked vertices is prohibited // moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds // moving vertices on a border requires a stronger edge quadric to preserve the border geometry if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border) @@ -1709,36 +1742,64 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s continue; } + // reject updates that increase positional error too much; allow some tolerance to improve attribute quality + if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f) + { + TRACESTATS(5); + continue; + } + TRACESTATS(1); vertex_positions[i] = p; } #if TRACE - printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4])); + printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5])); #endif +} - if (attribute_count == 0) - return; - +static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update) +{ for (size_t i = 0; i < vertex_count; ++i) { if (!vertex_update[i]) continue; - // updating externally locked vertices is prohibited - if (vertex_kind[i] == Kind_Locked) + if (remap[i] != i) continue; - const Vector3& p = vertex_positions[remap[i]]; - const Quadric& A = attribute_quadrics[i]; - - float iw = A.w == 0 ? 
0.f : 1.f / A.w; - for (size_t k = 0; k < attribute_count; ++k) { - const QuadricGrad& G = attribute_gradients[i * attribute_count + k]; + unsigned int shared = ~0u; - vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw; + // for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared + if (vertex_kind[i] == Kind_Complex) + { + shared = unsigned(i); + + for (unsigned int v = wedge[i]; v != i; v = wedge[v]) + if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k]) + shared = ~0u; + else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w) + shared = v; + } + + // update attributes for all wedges + unsigned int v = unsigned(i); + do + { + unsigned int r = (shared == ~0u) ? v : shared; + + const Vector3& p = vertex_positions[i]; // same for all wedges + const Quadric& A = attribute_quadrics[r]; + const QuadricGrad& G = attribute_gradients[r * attribute_count + k]; + + float iw = A.w == 0 ? 0.f : 1.f / A.w; + float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw; + + vertex_attributes[v * attribute_count + k] = av; + v = wedge[v]; + } while (v != i); } } } @@ -2264,7 +2325,7 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float // three point interpolation from "revenge of interpolation search" paper float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0); float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2); - return x1 + num / den; + return x1 + (den == 0.f ? 0.f : num / den); } } // namespace meshopt @@ -2519,16 +2580,19 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic { unsigned int v = result[i]; - // recomputing externally locked vertices may result in floating point drift - vertex_update[v] = vertex_kind[v] != Kind_Locked; + // mark the vertex for finalizeVertices and root vertex for solve* + vertex_update[remap[v]] = vertex_update[v] = 1; } // edge adjacency may be stale as we haven't updated it after last series of edge collapses updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap); - solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update); + solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update); - finalizeVertices(const_cast(vertex_positions_data), vertex_positions_stride, const_cast(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update); + if (attribute_count) + solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update); + + finalizeVertices(const_cast(vertex_positions_data), vertex_positions_stride, const_cast(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock); } // if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with 
sparsity diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp index b65627900d2..8a785fcd560 100644 --- a/thirdparty/meshoptimizer/spatialorder.cpp +++ b/thirdparty/meshoptimizer/spatialorder.cpp @@ -208,6 +208,7 @@ static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigne partitionPoints(axis, temp, sides, split, count); } + // recursion depth is logarithmic and bounded as we always split in approximately half splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size); splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size); } diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp index 847589b3d4e..7085cce3235 100644 --- a/thirdparty/meshoptimizer/vertexcodec.cpp +++ b/thirdparty/meshoptimizer/vertexcodec.cpp @@ -122,7 +122,7 @@ namespace meshopt const unsigned char kVertexHeader = 0xa0; -static int gEncodeVertexVersion = 0; +static int gEncodeVertexVersion = 1; const int kDecodeVertexVersion = 1; const size_t kVertexBlockSizeBytes = 8192; diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp index af15d59c607..3fd83608369 100644 --- a/thirdparty/meshoptimizer/vertexfilter.cpp +++ b/thirdparty/meshoptimizer/vertexfilter.cpp @@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count) static void decodeFilterQuat(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; ++i) { // recover scale from the high byte of the component int sf = data[i * 4 + 3] | 3; - float ss = scale / float(sf); + float s = float(sf); - // convert x/y/z to [-1..1] (scaled...) - float x = float(data[i * 4 + 0]) * ss; - float y = float(data[i * 4 + 1]) * ss; - float z = float(data[i * 4 + 2]) * ss; + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + float x = float(data[i * 4 + 0]); + float y = float(data[i * 4 + 1]); + float z = float(data[i * 4 + 2]); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float ww = 1.f - x * x - y * y - z * z; + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float ws = s * s; + float ww = ws * 2.f - x * x - y * y - z * z; float w = sqrtf(ww >= 0.f ? ww : 0.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float ss = scale / s; + // rounded signed float->int - int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f)); - int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f)); - int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f)); - int wf = int(w * 32767.f + 0.5f); + int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f)); + int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f)); + int zf = int(z * ss + (z >= 0.f ? 
0.5f : -0.5f)); + int wf = int(w * ss + 0.5f); int qc = data[i * 4 + 3] & 3; @@ -347,7 +352,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -366,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3)); - __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf)); + __m128 s = _mm_cvtepi32_ps(sf); - // convert x/y/z to [-1..1] (scaled...) - __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss); - __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss); - __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss); + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + __m128 x = _mm_cvtepi32_ps(xf); + __m128 y = _mm_cvtepi32_ps(yf); + __m128 z = _mm_cvtepi32_ps(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + __m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication + __m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps())); - __m128 s = _mm_set1_ps(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + __m128 ss = _mm_div_ps(_mm_set1_ps(scale), s); // rounded signed float->int - __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); - __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); - __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); - __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s)); + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss)); + __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss)); // mix x/z and w/y to make 16-bit unpack easier __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16)); @@ -542,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y) r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate return vmulq_f32(x, r); } + +#ifndef __ARM_FEATURE_FMA +inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z) +{ + return vaddq_f32(x, vmulq_f32(y, z)); +} +#endif #endif #ifdef SIMD_NEON @@ -572,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count) y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign)))); // compute normal length & scale - float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))); + float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z); float32x4_t rl = vrsqrteq_f32(ll); float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value - // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so 
we can omit the subtraction + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); - int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); - int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); + int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s)); + int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s)); + int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s)); // combine xr/yr/zr into final value - int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000)); - res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff))); - res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8)); - res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16)); + int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8); + res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res); vst1q_s32(reinterpret_cast(&data[i * 4]), res); } @@ -626,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count) y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign)))); // compute normal length & scale - float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))); + float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z); +#if !defined(__aarch64__) && !defined(_M_ARM64) float32x4_t rl = vrsqrteq_f32(ll); rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl); +#else + float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll)); +#endif // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); - int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); - int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); + int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s)); + int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s)); + int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s)); // mix x/z and y/0 to make 16-bit unpack easier - int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); + int32x4_t xzr = vsliq_n_s32(xr, zr, 16); int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff)); // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w @@ -658,7 +675,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -677,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3)); - float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf)); + float32x4_t s = vcvtq_f32_s32(sf); - // convert x/y/z to [-1..1] (scaled...) 
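// For reference, a sketch of the math behind the reworked (unscaled) quaternion decode:
// a stored component q in [-sf, sf] represents the real value q / (sf * sqrt(2)), so
//   w_real^2 = 1 - (x^2 + y^2 + z^2) / (2 * sf^2)
// and keeping w in the same fixed-point domain (w = w_real * sf * sqrt(2)) gives
//   w^2 = 2 * sf^2 - x^2 - y^2 - z^2
// which is the ws * 2 - x*x - y*y - z*z expression used below; the final multiply by
// scale / s = 32767 / (sqrt(2) * sf) converts all four components to int16 range at once.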
- float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss); - float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss); - float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss); + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + float32x4_t x = vcvtq_f32_s32(xf); + float32x4_t y = vcvtq_f32_s32(yf); + float32x4_t z = vcvtq_f32_s32(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors - float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors + float32x4_t ws = vmulq_f32(s, s); + float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z)); float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f))); - float32x4_t s = vdupq_n_f32(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); - int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); - int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); - int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap)); + int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss)); + int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss)); + int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss)); + int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss)); // mix x/z and w/y to make 16-bit unpack easier - int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); - int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16)); + int32x4_t xzr = vsliq_n_s32(xr, zr, 16); + int32x4_t wyr = vsliq_n_s32(wr, yr, 16); // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0) - int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]); - int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]); + uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]); + uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]); + + // store results to stack so that we can rotate using scalar instructions + // TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808 + volatile uint64_t res[4]; + vst1q_u64(const_cast(&res[0]), res_0); + vst1q_u64(const_cast(&res[2]), res_1); // rotate and store - uint64_t* out = (uint64_t*)&data[i * 4]; + uint64_t* out = reinterpret_cast(&data[i * 4]); - out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4); - out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4); - out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 
4); - out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4); + out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4); + out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4); + out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4); + out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4); } } @@ -767,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count) int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf)); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value - // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap)); - int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap)); - int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap)); - int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap)); + int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss)); + int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss)); + int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss)); + int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss)); // repack rgba into final value - int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff)); - res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8)); - res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16)); - res = vorrq_s32(res, vshlq_n_s32(ar, 24)); + int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8); vst1q_s32(reinterpret_cast(&data[i * 4]), res); } @@ -824,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count) // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const float32x4_t fsnap = vdupq_n_f32(3 << 22); - int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap)); - int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap)); - int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap)); - int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap)); + int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss)); + int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss)); + int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss)); + int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss)); // mix r/b and g/a to make 16-bit unpack easier - int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16)); - int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16)); + int32x4_t rbr = vsliq_n_s32(rr, br, 16); + int32x4_t gar = vsliq_n_s32(gr, ar, 16); // pack r/g/b/a using 16-bit unpacks int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]); @@ -958,7 +981,7 @@ static void decodeFilterOctSimd16(short* data, size_t count) static void decodeFilterQuatSimd(short* data, size_t count) { - const float scale = 
1.f / sqrtf(2.f); + const float scale = 32767.f / sqrtf(2.f); for (size_t i = 0; i < count; i += 4) { @@ -977,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count) // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3)); - v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf)); + v128_t s = wasm_f32x4_convert_i32x4(sf); - // convert x/y/z to [-1..1] (scaled...) - v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss); - v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss); - v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss); + // convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf) + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_convert_i32x4(zf); - // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + // reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors // note: i32x4_max with 0 is equivalent to f32x4_max - v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); + v128_t ws = wasm_f32x4_mul(s, s); + v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0))); - v128_t s = wasm_f32x4_splat(32767.f); + // compute final scale; note that all computations above are unscaled + // we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const v128_t fsnap = wasm_f32x4_splat(3 << 22); - v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); - v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); - v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); - v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap); + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap); + v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap); // mix x/z and w/y to make 16-bit unpack easier v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16)); @@ -1131,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count) v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf)); // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value - // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction const v128_t fsnap = wasm_f32x4_splat(3 << 22); v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap); @@ -1250,7 +1276,7 @@ void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride) void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, 
const float* data)
{
    assert(stride == 4 || stride == 8);
-   assert(bits >= 1 && bits <= 16);
+   assert(bits >= 2 && bits <= 16);

    signed char* d8 = static_cast<signed char*>(destination);
    short* d16 = static_cast<short*>(destination);
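A minimal encoding sketch that respects the tightened bits range above (normal_count and the input layout of 4 floats per normal are assumptions):

    std::vector<float> normals(normal_count * 4);       // x, y, z, padding per normal, already normalized
    std::vector<signed char> packed(normal_count * 4);  // 4 bytes per normal when stride == 4
    meshopt_encodeFilterOct(packed.data(), normal_count, /* stride= */ 4, /* bits= */ 8, normals.data());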