mirror of https://github.com/godotengine/godot
Merge pull request #113983 from akien-mga/meshoptimizer-1.0
meshoptimizer: Update to 1.0
This commit is contained in: commit 5f56002e5b
@@ -695,7 +695,7 @@ Patches:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: 0.25 (6daea4695c48338363b08022d2fb15deaef6ac09, 2025)
+- Version: 1.0 (73583c335e541c139821d0de2bf5f12960a04941, 2025)
 - License: MIT
 
 Files extracted from upstream repository:
@@ -6,6 +6,17 @@
 #include <math.h>
 #include <string.h>
 
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+#if defined(__SSE2__) || (defined(_MSC_VER) && defined(_M_X64))
+#define SIMD_SSE
+#include <emmintrin.h>
+#elif defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64) && _MSC_VER >= 1922)
+#define SIMD_NEON
+#include <arm_neon.h>
+#endif
+#endif // !MESHOPTIMIZER_NO_SIMD
+
 // This work is based on:
 // Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
 // Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
@@ -149,6 +160,19 @@ static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const un
 	}
 }
 
+static void clearUsed(short* used, size_t vertex_count, const unsigned int* indices, size_t index_count)
+{
+	// for sparse inputs, it's faster to only clear vertices referenced by the index buffer
+	if (vertex_count <= index_count)
+		memset(used, -1, vertex_count * sizeof(short));
+	else
+		for (size_t i = 0; i < index_count; ++i)
+		{
+			assert(indices[i] < vertex_count);
+			used[indices[i]] = -1;
+		}
+}
+
 static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride, size_t axis_count)
 {
 	static const float kAxes[7][3] = {
@@ -265,21 +289,8 @@ struct Cone
 	float nx, ny, nz;
 };
 
-static float getDistance(float dx, float dy, float dz, bool aa)
-{
-	if (!aa)
-		return sqrtf(dx * dx + dy * dy + dz * dz);
-
-	float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);
-	float rxy = rx > ry ? rx : ry;
-	return rxy > rz ? rxy : rz;
-}
-
 static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
 {
-	if (cone_weight < 0)
-		return 1 + distance / expected_radius;
-
 	float cone = 1.f - spread * cone_weight;
 	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;
 
@@ -348,15 +359,6 @@ static float computeTriangleCones(Cone* triangles, const unsigned int* indices,
 	return mesh_area;
 }
 
-static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
-{
-	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;
-
-	// fill 4b padding with 0
-	while (offset & 3)
-		meshlet_triangles[offset++] = 0;
-}
-
 static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
 {
 	short& av = used[a];
@@ -374,10 +376,8 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
 		for (size_t j = 0; j < meshlet.vertex_count; ++j)
 			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;
 
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlet.vertex_offset += meshlet.vertex_count;
-		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.triangle_offset += meshlet.triangle_count * 3;
 		meshlet.vertex_count = 0;
 		meshlet.triangle_count = 0;
 
@@ -453,7 +453,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			const Cone& tri_cone = triangles[triangle];
 
 			float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
-			float distance = getDistance(dx, dy, dz, cone_weight < 0);
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
 			float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
 
 			float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);
@@ -514,7 +514,8 @@ static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& me
 		if (best_neighbor == ~0u)
 			continue;
 
-		float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);
+		float dx = triangles[best_neighbor].px - cornerx, dy = triangles[best_neighbor].py - cornery, dz = triangles[best_neighbor].pz - cornerz;
+		float best_neighbor_score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		for (size_t j = 0; j < kMeshletAddSeeds; ++j)
 		{
@@ -566,7 +567,8 @@ static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_co
 		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
 
 		unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
-		float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);
+		float dx = triangles[index].px - cornerx, dy = triangles[index].py - cornery, dz = triangles[index].pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		if (live < best_live || (live == best_live && score < best_score))
 		{
@@ -587,13 +589,13 @@ struct KDNode
 		unsigned int index;
 	};
 
-	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// leaves: axis = 3, children = number of points including this one
 	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
 	unsigned int axis : 2;
 	unsigned int children : 30;
 };
 
-static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, int axis, float pivot)
 {
 	size_t m = 0;
 
@@ -623,7 +625,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 
 	result.index = indices[0];
 	result.axis = 3;
-	result.children = unsigned(count - 1);
+	result.children = unsigned(count);
 
 	// all remaining points are stored in nodes immediately following the leaf
 	for (size_t i = 1; i < count; ++i)
@@ -638,7 +640,7 @@ static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, u
 	return offset + count;
 }
 
-static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size, int depth)
 {
 	assert(count > 0);
 	assert(offset < node_count);
@@ -664,13 +666,14 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	}
 
 	// split axis is one where the variance is largest
-	unsigned int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
+	int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
 
 	float split = mean[axis];
 	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
 
 	// when the partition is degenerate simply consolidate the points into a single node
-	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+	// this also ensures recursion depth is bounded on pathological inputs
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2 || depth >= kMeshletMaxTreeDepth)
 		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
 
 	KDNode& result = nodes[offset];
@@ -679,32 +682,40 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	result.axis = axis;
 
 	// left subtree is right after our node
-	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size, depth + 1);
 
 	// distance to the right subtree is represented explicitly
 	assert(next_offset - offset > 1);
 	result.children = unsigned(next_offset - offset - 1);
 
-	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size, depth + 1);
 }
 
-static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
 {
 	const KDNode& node = nodes[root];
 
+	if (node.children == 0)
+		return;
+
 	if (node.axis == 3)
 	{
 		// leaf
-		for (unsigned int i = 0; i <= node.children; ++i)
+		bool inactive = true;
+
+		for (unsigned int i = 0; i < node.children; ++i)
 		{
 			unsigned int index = nodes[root + i].index;
 
 			if (emitted_flags[index])
 				continue;
 
+			inactive = false;
+
 			const float* point = points + index * stride;
 
 			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
-			float distance = getDistance(dx, dy, dz, aa);
+			float distance = sqrtf(dx * dx + dy * dy + dz * dz);
 
 			if (distance < limit)
 			{
@@ -712,6 +723,10 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 				limit = distance;
 			}
 		}
+
+		// deactivate leaves that no longer have items to emit
+		if (inactive)
+			nodes[root].children = 0;
 	}
 	else
 	{
@@ -720,34 +735,87 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		unsigned int first = (delta <= 0) ? 0 : node.children;
 		unsigned int second = first ^ node.children;
 
-		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);
+		// deactivate branches that no longer have items to emit to accelerate traversal
+		// note that we do this *before* recursing which delays deactivation but keeps tail calls
+		if ((nodes[root + 1 + first].children | nodes[root + 1 + second].children) == 0)
+			nodes[root].children = 0;
+
+		// recursion depth is bounded by tree depth (which is limited by construction)
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
 
 		// only process the other node if it can have a match based on closest distance so far
 		if (fabsf(delta) <= limit)
-			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
 	}
 }
 
+struct BVHBoxT
+{
+	float min[4];
+	float max[4];
+};
+
 struct BVHBox
 {
 	float min[3];
 	float max[3];
 };
 
-static void boxMerge(BVHBox& box, const BVHBox& other)
+#if defined(SIMD_SSE)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	__m128 min = _mm_loadu_ps(box.min);
+	__m128 max = _mm_loadu_ps(box.max);
+
+	// note: over-read is safe because BVHBox array is allocated with padding
+	min = _mm_min_ps(min, _mm_loadu_ps(other.min));
+	max = _mm_max_ps(max, _mm_loadu_ps(other.max));
+
+	_mm_storeu_ps(box.min, min);
+	_mm_storeu_ps(box.max, max);
+
+	__m128 size = _mm_sub_ps(max, min);
+	__m128 size_yzx = _mm_shuffle_ps(size, size, _MM_SHUFFLE(0, 0, 2, 1));
+	__m128 mul = _mm_mul_ps(size, size_yzx);
+	__m128 sum_xy = _mm_add_ss(mul, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(1, 1, 1, 1)));
+	__m128 sum_xyz = _mm_add_ss(sum_xy, _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 2, 2, 2)));
+
+	return _mm_cvtss_f32(sum_xyz);
+}
+#elif defined(SIMD_NEON)
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
+{
+	float32x4_t min = vld1q_f32(box.min);
+	float32x4_t max = vld1q_f32(box.max);
+
+	// note: over-read is safe because BVHBox array is allocated with padding
+	min = vminq_f32(min, vld1q_f32(other.min));
+	max = vmaxq_f32(max, vld1q_f32(other.max));
+
+	vst1q_f32(box.min, min);
+	vst1q_f32(box.max, max);
+
+	float32x4_t size = vsubq_f32(max, min);
+	float32x4_t size_yzx = vextq_f32(vextq_f32(size, size, 3), size, 2);
+	float32x4_t mul = vmulq_f32(size, size_yzx);
+	float sum_xy = vgetq_lane_f32(mul, 0) + vgetq_lane_f32(mul, 1);
+	float sum_xyz = sum_xy + vgetq_lane_f32(mul, 2);
+
+	return sum_xyz;
+}
+#else
+static float boxMerge(BVHBoxT& box, const BVHBox& other)
 {
 	for (int k = 0; k < 3; ++k)
 	{
 		box.min[k] = other.min[k] < box.min[k] ? other.min[k] : box.min[k];
 		box.max[k] = other.max[k] > box.max[k] ? other.max[k] : box.max[k];
 	}
-}
 
-inline float boxSurface(const BVHBox& box)
-{
 	float sx = box.max[0] - box.min[0], sy = box.max[1] - box.min[1], sz = box.max[2] - box.min[2];
 	return sx * sy + sx * sz + sy * sz;
 }
+#endif
 
 inline unsigned int radixFloat(unsigned int v)
 {
@@ -833,7 +901,7 @@ static void bvhPrepare(BVHBox* boxes, float* centroids, const unsigned int* indi
 	}
 }
 
-static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices)
+static size_t bvhCountVertices(const unsigned int* order, size_t count, short* used, const unsigned int* indices, unsigned int* out = NULL)
 {
 	// count number of unique vertices
 	size_t used_vertices = 0;
@@ -844,6 +912,9 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size
 
 		used_vertices += (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
 		used[a] = used[b] = used[c] = 1;
+
+		if (out)
+			out[i] = unsigned(used_vertices);
 	}
 
 	// reset used[] for future invocations
@@ -855,16 +926,16 @@ static bool bvhPackLeaf(unsigned char* boundary, const unsigned int* order, size
 		used[a] = used[b] = used[c] = -1;
 	}
 
-	if (used_vertices > max_vertices)
-		return false;
+	return used_vertices;
+}
 
+static void bvhPackLeaf(unsigned char* boundary, size_t count)
+{
 	// mark meshlet boundary for future reassembly
 	assert(count > 0);
 
 	boundary[0] = 1;
 	memset(boundary + 1, 0, count - 1);
-
-	return true;
 }
 
 static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size_t count, short* used, const unsigned int* indices, size_t max_vertices, size_t max_triangles)
@@ -873,8 +944,9 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size
 	{
 		size_t chunk = i + max_triangles <= count ? max_triangles : count - i;
 
-		if (bvhPackLeaf(boundary + i, order + i, chunk, used, indices, max_vertices))
+		if (bvhCountVertices(order + i, chunk, used, indices) <= max_vertices)
 		{
+			bvhPackLeaf(boundary + i, chunk);
 			i += chunk;
 			continue;
 		}
@@ -882,7 +954,7 @@ static void bvhPackTail(unsigned char* boundary, const unsigned int* order, size
 		// chunk is vertex bound, split it into smaller meshlets
 		assert(chunk > max_vertices / 3);
 
-		bvhPackLeaf(boundary + i, order + i, max_vertices / 3, used, indices, max_vertices);
+		bvhPackLeaf(boundary + i, max_vertices / 3);
 		i += max_vertices / 3;
 	}
 }
@@ -895,25 +967,27 @@ static bool bvhDivisible(size_t count, size_t min, size_t max)
 	return min * 2 <= max ? count >= min : count % min <= (count / min) * (max - min);
 }
 
-static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t count, void* scratch, size_t step, size_t min, size_t max, float fill, float* out_cost)
+static void bvhComputeArea(float* areas, const BVHBox* boxes, const unsigned int* order, size_t count)
 {
-	BVHBox accuml = boxes[order[0]], accumr = boxes[order[count - 1]];
-	float* costs = static_cast<float*>(scratch);
+	BVHBoxT accuml = {{FLT_MAX, FLT_MAX, FLT_MAX, 0}, {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0}};
+	BVHBoxT accumr = accuml;
 
 	// accumulate SAH cost in forward and backward directions
 	for (size_t i = 0; i < count; ++i)
 	{
-		boxMerge(accuml, boxes[order[i]]);
-		boxMerge(accumr, boxes[order[count - 1 - i]]);
+		float larea = boxMerge(accuml, boxes[order[i]]);
+		float rarea = boxMerge(accumr, boxes[order[count - 1 - i]]);
 
-		costs[i] = boxSurface(accuml);
-		costs[i + count] = boxSurface(accumr);
+		areas[i] = larea;
+		areas[i + count] = rarea;
 	}
+}
 
+static size_t bvhPivot(const float* areas, const unsigned int* vertices, size_t count, size_t step, size_t min, size_t max, float fill, size_t maxfill, float* out_cost)
+{
 	bool aligned = count >= min * 2 && bvhDivisible(count, min, max);
 	size_t end = aligned ? count - min : count - 1;
 
-	float rmaxf = 1.f / float(int(max));
+	float rmaxfill = 1.f / float(int(maxfill));
 
 	// find best split that minimizes SAH
 	size_t bestsplit = 0;
@@ -928,17 +1002,22 @@ static size_t bvhPivot(const BVHBox* boxes, const unsigned int* order, size_t co
 		if (aligned && !bvhDivisible(rsplit, min, max))
 			continue;
 
-		// costs[x] = inclusive surface area of boxes[0..x]
-		// costs[count-1-x] = inclusive surface area of boxes[x..count-1]
-		float larea = costs[i], rarea = costs[(count - 1 - (i + 1)) + count];
+		// areas[x] = inclusive surface area of boxes[0..x]
+		// areas[count-1-x] = inclusive surface area of boxes[x..count-1]
+		float larea = areas[i], rarea = areas[(count - 1 - (i + 1)) + count];
 		float cost = larea * float(int(lsplit)) + rarea * float(int(rsplit));
 
 		if (cost > bestcost)
 			continue;
 
-		// fill cost; use floating point math to avoid expensive integer modulo
-		int lrest = int(float(int(lsplit + max - 1)) * rmaxf) * int(max) - int(lsplit);
-		int rrest = int(float(int(rsplit + max - 1)) * rmaxf) * int(max) - int(rsplit);
+		// use vertex fill when splitting vertex limited clusters; note that we use the same (left->right) vertex count
+		// using bidirectional vertex counts is a little more expensive to compute and produces slightly worse results in practice
+		size_t lfill = vertices ? vertices[i] : lsplit;
+		size_t rfill = vertices ? vertices[i] : rsplit;
+
+		// fill cost; use floating point math to round up to maxfill to avoid expensive integer modulo
+		int lrest = int(float(int(lfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(lfill);
+		int rrest = int(float(int(rfill + maxfill - 1)) * rmaxfill) * int(maxfill) - int(rfill);
 
 		cost += fill * (float(lrest) * larea + float(rrest) * rarea);
 
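As an illustration of the fill-cost trick referenced in the comment above (rounding a fill count up to the next multiple of maxfill with a precomputed reciprocal instead of an integer modulo), here is a minimal standalone sketch; the helper name and sample values are illustrative, not from the library:

#include <cassert>

// rest = ceil(fill / maxfill) * maxfill - fill, i.e. how far fill is from the next multiple,
// computed via a precomputed float reciprocal rather than the % operator
static int fillRemainder(int fill, int maxfill, float rmaxfill /* = 1.f / maxfill */)
{
	return int(float(fill + maxfill - 1) * rmaxfill) * maxfill - fill;
}

int main()
{
	assert(fillRemainder(97, 64, 1.f / 64) == 31);                  // 2 * 64 - 97
	assert(fillRemainder(128, 64, 1.f / 64) == 0);                  // already a full multiple
	assert(fillRemainder(97, 64, 1.f / 64) == (64 - 97 % 64) % 64); // agrees with the modulo form
}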
@@ -971,11 +1050,8 @@ static void bvhPartition(unsigned int* target, const unsigned int* order, const
 
 static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* ordery, unsigned int* orderz, unsigned char* boundary, size_t count, int depth, void* scratch, short* used, const unsigned int* indices, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight)
 {
-	if (depth >= kMeshletMaxTreeDepth)
-		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
-
-	if (count <= max_triangles && bvhPackLeaf(boundary, orderx, count, used, indices, max_vertices))
-		return;
+	if (count <= max_triangles && bvhCountVertices(orderx, count, used, indices) <= max_vertices)
+		return bvhPackLeaf(boundary, count);
 
 	unsigned int* axes[3] = {orderx, ordery, orderz};
 
@@ -984,9 +1060,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 
 	// if we could not pack the meshlet, we must be vertex bound
 	size_t mint = count <= max_triangles && max_vertices / 3 < min_triangles ? max_vertices / 3 : min_triangles;
-
-	// only use fill weight if we are optimizing for triangle count
-	float fill = count <= max_triangles ? 0.f : fill_weight;
+	size_t maxfill = count <= max_triangles ? max_vertices : max_triangles;
 
 	// find best split that minimizes SAH
 	int bestk = -1;
@@ -995,8 +1069,20 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 
 	for (int k = 0; k < 3; ++k)
 	{
+		float* areas = static_cast<float*>(scratch);
+		unsigned int* vertices = NULL;
+
+		bvhComputeArea(areas, boxes, axes[k], count);
+
+		if (count <= max_triangles)
+		{
+			// for vertex bound clusters, count number of unique vertices for each split
+			vertices = reinterpret_cast<unsigned int*>(areas + 2 * count);
+			bvhCountVertices(axes[k], count, used, indices, vertices);
+		}
+
 		float axiscost = FLT_MAX;
-		size_t axissplit = bvhPivot(boxes, axes[k], count, scratch, step, mint, max_triangles, fill, &axiscost);
+		size_t axissplit = bvhPivot(areas, vertices, count, step, mint, max_triangles, fill_weight, maxfill, &axiscost);
 
 		if (axissplit && axiscost < bestcost)
 		{
@@ -1006,8 +1092,8 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 		}
 	}
 
-	// this may happen if SAH costs along the admissible splits are NaN
-	if (bestk < 0)
+	// this may happen if SAH costs along the admissible splits are NaN, or due to imbalanced splits on pathological inputs
+	if (bestk < 0 || depth >= kMeshletMaxTreeDepth)
 		return bvhPackTail(boundary, orderx, count, used, indices, max_vertices, max_triangles);
 
 	// mark sides of split for partitioning
@@ -1032,6 +1118,7 @@ static void bvhSplit(const BVHBox* boxes, unsigned int* orderx, unsigned int* or
 		bvhPartition(axis, temp, sides, bestsplit, count);
 	}
 
+	// recursion depth is bounded due to max depth check above
 	bvhSplit(boxes, orderx, ordery, orderz, boundary, bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 	bvhSplit(boxes, orderx + bestsplit, ordery + bestsplit, orderz + bestsplit, boundary + bestsplit, count - bestsplit, depth + 1, scratch, used, indices, max_vertices, min_triangles, max_triangles, fill_weight);
 }
@@ -1045,7 +1132,6 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	assert(index_count % 3 == 0);
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	(void)kMeshletMaxVertices;
 	(void)kMeshletMaxTriangles;
@@ -1070,9 +1156,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
-	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
-	assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets
+	assert(cone_weight >= 0 && cone_weight <= 1);
 	assert(split_factor >= 0);
 
 	if (index_count == 0)
@@ -1108,7 +1193,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 		kdindices[i] = unsigned(i);
 
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
-	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8, 0);
 
 	// find a specific corner of the mesh to use as a starting point for meshlet flow
 	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
@@ -1124,7 +1209,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	// initial seed triangle is the one closest to the corner
 	unsigned int initial_seed = ~0u;
@@ -1134,7 +1219,8 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 	{
 		const Cone& tri = triangles[i];
 
-		float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);
+		float dx = tri.px - cornerx, dy = tri.py - cornery, dz = tri.pz - cornerz;
+		float score = sqrtf(dx * dx + dy * dy + dz * dz);
 
 		if (initial_seed == ~0u || score < initial_score)
 		{
@@ -1174,7 +1260,7 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 			unsigned int index = ~0u;
 			float distance = FLT_MAX;
 
-			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance);
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, distance);
 
 			best_triangle = index;
 			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
@@ -1244,20 +1330,15 @@ size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshle
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
 size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
-	assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex
-
 	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
 }
 
@@ -1269,13 +1350,12 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	meshopt_Allocator allocator;
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -1290,13 +1370,10 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
@@ -1310,7 +1387,6 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
 	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
-	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	if (index_count == 0)
 		return 0;
@@ -1321,13 +1397,14 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i
 	meshopt_Allocator allocator;
 
 	// 3 floats plus 1 uint for sorting, or
-	// 2 floats for SAH costs, or
+	// 2 floats plus 1 uint for pivoting, or
 	// 1 uint plus 1 byte for partitioning
 	float* scratch = allocator.allocate<float>(face_count * 4);
 
 	// compute bounding boxes and centroids for sorting
-	BVHBox* boxes = allocator.allocate<BVHBox>(face_count);
+	BVHBox* boxes = allocator.allocate<BVHBox>(face_count + 1); // padding for SIMD
 	bvhPrepare(boxes, scratch, indices, face_count, vertex_positions, vertex_count, vertex_stride_float);
+	memset(boxes + face_count, 0, sizeof(BVHBox));
 
 	unsigned int* axes = allocator.allocate<unsigned int>(face_count * 3);
 	unsigned int* temp = reinterpret_cast<unsigned int*>(scratch) + face_count * 3;
@@ -1351,7 +1428,7 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i
 
 	// index of the vertex in the meshlet, -1 if the vertex isn't used
 	short* used = allocator.allocate<short>(vertex_count);
-	memset(used, -1, vertex_count * sizeof(short));
+	clearUsed(used, vertex_count, indices, index_count);
 
 	unsigned char* boundary = allocator.allocate<unsigned char>(face_count);
 
@@ -1392,13 +1469,10 @@ size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned i
 	}
 
 	if (meshlet.triangle_count)
-	{
-		finishMeshlet(meshlet, meshlet_triangles);
-
 		meshlets[meshlet_offset++] = meshlet;
-	}
 
 	assert(meshlet_offset <= meshlet_bound);
+	assert(meshlet.triangle_offset + meshlet.triangle_count * 3 <= index_count && meshlet.vertex_offset + meshlet.vertex_count <= index_count);
 	return meshlet_offset;
 }
 
@@ -1694,3 +1768,6 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
 	assert(vertex_offset <= vertex_count);
 	memcpy(vertices, order, vertex_offset * sizeof(unsigned int));
 }
+
+#undef SIMD_SSE
+#undef SIMD_NEON
@@ -1,5 +1,5 @@
 /**
- * meshoptimizer - version 0.25
+ * meshoptimizer - version 1.0
  *
  * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 250 /* 0.25 */
+#define MESHOPTIMIZER_VERSION 1000 /* 1.0 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -125,14 +125,14 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
 /**
- * Experimental: Generates a remap table that maps all vertices with the same position to the same (existing) index.
+ * Generates a remap table that maps all vertices with the same position to the same (existing) index.
  * Similarly to meshopt_generateShadowIndexBuffer, this can be helpful to pre-process meshes for position-only rendering.
  * This can also be used to implement algorithms that require positional-only connectivity, such as hierarchical simplification.
  *
 * destination must contain enough space for the resulting remap table (vertex_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_API void meshopt_generatePositionRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
 * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
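As a usage aside for the now non-experimental API above, a minimal sketch (assumes tightly packed float3 positions; names are illustrative):

#include <vector>
#include "meshoptimizer.h"

// builds a remap table where every vertex sharing a position maps to one existing index
std::vector<unsigned int> buildPositionRemap(const float* positions, size_t vertex_count)
{
	std::vector<unsigned int> remap(vertex_count); // destination needs vertex_count elements
	meshopt_generatePositionRemap(remap.data(), positions, vertex_count, sizeof(float) * 3);
	return remap;
}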
@@ -179,7 +179,7 @@ MESHOPTIMIZER_API size_t meshopt_generateProvokingIndexBuffer(unsigned int* dest
 /**
 * Vertex transform cache optimizer
 * Reorders indices to reduce the number of GPU vertex shader invocations
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 */
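As an aside, a sketch of what "called on each range individually" can look like in practice; the DrawRange struct is hypothetical, and the meshopt_optimizeVertexCache signature is assumed from the library since it is not shown in this hunk:

#include <vector>
#include "meshoptimizer.h"

struct DrawRange { size_t first_index, index_count; };

void optimizeRangesForVertexCache(unsigned int* indices, const std::vector<DrawRange>& ranges, size_t vertex_count)
{
	std::vector<unsigned int> scratch;

	for (const DrawRange& r : ranges)
	{
		// optimize each draw call's slice separately against the shared vertex buffer
		scratch.assign(indices + r.first_index, indices + r.first_index + r.index_count);
		meshopt_optimizeVertexCache(indices + r.first_index, scratch.data(), r.index_count, vertex_count);
	}
}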
@@ -198,7 +198,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destinatio
 * Vertex transform cache optimizer for FIFO caches
 * Reorders indices to reduce the number of GPU vertex shader invocations
 * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 * cache_size should be less than the actual GPU cache size to avoid cache thrashing
@@ -208,7 +208,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
 /**
 * Overdraw optimizer
 * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
- * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
 *
 * destination must contain enough space for the resulting index buffer (index_count elements)
 * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
@@ -221,7 +221,7 @@ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const
 * Vertex fetch cache optimizer
 * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
 * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
- * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
 *
 * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
 * indices is used both as an input and as an output index buffer
@@ -251,7 +251,8 @@ MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t
 MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
 
 /**
- * Set index encoder format version
+ * Set index encoder format version (defaults to 1)
+ *
 * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
 */
 MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
@@ -303,6 +304,7 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 * For maximum efficiency the vertex buffer being encoded has to be quantized and optimized for locality of reference (cache/fetch) first.
 *
 * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
 */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
@@ -313,13 +315,16 @@ MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, si
 * For compression level to take effect, the vertex encoding version must be set to 1.
 * The default compression level implied by meshopt_encodeVertexBuffer is 2.
 *
 * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ * vertex_size must be a multiple of 4 (and <= 256)
 * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ * version should be -1 to use the default version (specified via meshopt_encodeVertexVersion), or 0/1 to override the version; per above, level won't take effect if version is 0.
 */
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level, int version);
 
 /**
- * Set vertex encoder format version
+ * Set vertex encoder format version (defaults to 1)
+ *
 * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
 */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
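As a usage aside, a minimal sketch of the encoder documented above (buffer handling is illustrative; level 2 mirrors the default implied by meshopt_encodeVertexBuffer, and version -1 keeps the globally configured format version):

#include <vector>
#include "meshoptimizer.h"

std::vector<unsigned char> encodeVertices(const void* vertices, size_t vertex_count, size_t vertex_size)
{
	// vertex_size must be a multiple of 4 (and <= 256), per the comment above
	std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
	size_t size = meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(), vertices, vertex_count, vertex_size, /* level= */ 2, /* version= */ -1);
	buffer.resize(size);
	return buffer;
}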
@@ -331,6 +336,7 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 * The decoder is safe to use for untrusted input, but it may produce garbage data.
 *
 * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ * vertex_size must be a multiple of 4 (and <= 256)
 */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
 
@@ -345,29 +351,29 @@ MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, s
 * Vertex buffer filters
 * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
 *
- * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit signed X/Y as an input; Z must store 1.0f.
 * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
 *
- * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit component encoding and a 2-bit component index indicating which component to reconstruct.
 * Each component is stored as an 16-bit integer; stride must be equal to 8.
 *
 * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
 * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
 *
- * Experimental: meshopt_decodeFilterColor decodes YCoCg (+A) color encoding where RGB is converted to YCoCg space with variable bit quantization.
+ * meshopt_decodeFilterColor decodes RGBA colors from YCoCg (+A) color encoding where RGB is converted to YCoCg space with K-bit component encoding, and A is stored using K-1 bits.
 * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8.
 */
 MESHOPTIMIZER_API void meshopt_decodeFilterOct(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterQuat(void* buffer, size_t count, size_t stride);
 MESHOPTIMIZER_API void meshopt_decodeFilterExp(void* buffer, size_t count, size_t stride);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
+MESHOPTIMIZER_API void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride);
 
 /**
 * Vertex buffer filter encoders
 * These functions can be used to encode data in a format that meshopt_decodeFilter can decode
 *
- * meshopt_encodeFilterOct encodes unit vectors with K-bit (K <= 16) signed X/Y as an output.
- * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ * meshopt_encodeFilterOct encodes unit vectors with K-bit (2 <= K <= 16) signed X/Y as an output.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. Z will store 1.0f, W is preserved as is.
 * Input data must contain 4 floats for every vector (count*4 total).
 *
 * meshopt_encodeFilterQuat encodes unit quaternions with K-bit (4 <= K <= 16) component encoding.
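As a usage aside for the octahedral filter described above, a minimal sketch using the 8-bit, stride-4 layout (input is assumed to be count*4 floats with unit xyz and pass-through w):

#include <vector>
#include "meshoptimizer.h"

std::vector<unsigned char> packNormals(const float* normals, size_t count)
{
	std::vector<unsigned char> packed(count * 4); // stride 4: 8-bit normalized x/y/z/w per vector
	meshopt_encodeFilterOct(packed.data(), count, /* stride= */ 4, /* bits= */ 8, normals);
	// at load time, meshopt_decodeFilterOct(packed.data(), count, 4) reconstructs unit vectors in place
	return packed;
}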
@@ -378,7 +384,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterColor(void* buffer, size_t c
 * Exponent can be shared between all components of a given vector as defined by stride or all values of a given component; stride must be divisible by 4.
 * Input data must contain stride/4 floats for every vector (count*stride/4 total).
 *
- * Experimental: meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with variable bit quantization.
+ * meshopt_encodeFilterColor encodes RGBA color data by converting RGB to YCoCg color space with K-bit (2 <= K <= 16) component encoding; A is stored using K-1 bits.
 * Each component is stored as an 8-bit or 16-bit integer; stride must be equal to 4 or 8.
 * Input data must contain 4 floats for every color (count*4 total).
 */
@@ -397,7 +403,7 @@ enum meshopt_EncodeExpMode
 MESHOPTIMIZER_API void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterQuat(void* destination, size_t count, size_t stride, int bits, const float* data);
 MESHOPTIMIZER_API void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data, enum meshopt_EncodeExpMode mode);
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
+MESHOPTIMIZER_API void meshopt_encodeFilterColor(void* destination, size_t count, size_t stride, int bits, const float* data);
 
 /**
 * Simplification options
@@ -412,7 +418,7 @@ enum
 	meshopt_SimplifyErrorAbsolute = 1 << 2,
 	/* Remove disconnected parts of the mesh during simplification incrementally, regardless of the topological restrictions inside components. */
 	meshopt_SimplifyPrune = 1 << 3,
-	/* Experimental: Produce more regular triangle sizes and shapes during simplification, at some cost to geometric quality. */
+	/* Produce more regular triangle sizes and shapes during simplification, at some cost to geometric and attribute quality. */
 	meshopt_SimplifyRegularize = 1 << 4,
 	/* Experimental: Allow collapses across attribute discontinuities, except for vertices that are tagged with meshopt_SimplifyVertex_Protect in vertex_lock. */
 	meshopt_SimplifyPermissive = 1 << 5,
@@ -472,7 +478,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier with position/attribute update
+ * Mesh simplifier with position/attribute update
 * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible.
 * Similar to meshopt_simplifyWithAttributes, but destructively updates positions and attribute values for optimal appearance.
 * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
@@ -492,10 +498,10 @@ MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destinatio
 * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
 * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithUpdate(unsigned int* indices, size_t index_count, float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
 /**
- * Experimental: Mesh simplifier (sloppy)
+ * Mesh simplifier (sloppy)
 * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
 * The algorithm doesn't preserve mesh topology but can stop short of the target goal based on target error.
 * Returns the number of indices after simplification, with destination containing new index data
@@ -508,7 +514,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithUpdate(unsigned int* indic
 * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1]
 * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error);
 
 /**
 * Mesh simplifier (pruner)
@@ -639,7 +645,7 @@ struct meshopt_Meshlet
 	unsigned int vertex_offset;
 	unsigned int triangle_offset;
 
-	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range [offset..offset+count) for vertices and [offset..offset+count*3) for triangles */
 	unsigned int vertex_count;
 	unsigned int triangle_count;
 };
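To make the updated range comment concrete, a small sketch that reads one triangle of a meshlet back as global vertex indices (buffers are assumed to come from one of the meshopt_buildMeshlets* functions below):

#include "meshoptimizer.h"

void getMeshletTriangle(const meshopt_Meshlet& m, const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle, unsigned int out[3])
{
	// meshlet_triangles holds 3 meshlet-local indices per triangle in [triangle_offset..triangle_offset+triangle_count*3)
	// meshlet_vertices holds global vertex indices in [vertex_offset..vertex_offset+vertex_count)
	for (int k = 0; k < 3; ++k)
		out[k] = meshlet_vertices[m.vertex_offset + meshlet_triangles[m.triangle_offset + triangle * 3 + k]];
}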
@@ -653,10 +659,10 @@ struct meshopt_Meshlet
 * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
 *
 * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
- * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
+ * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512)
 * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
 */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
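A sketch of the worst-case sizing spelled out in the comments above (the limits and container choices are illustrative):

#include <vector>
#include "meshoptimizer.h"

size_t buildMeshlets(std::vector<meshopt_Meshlet>& meshlets, std::vector<unsigned int>& meshlet_vertices, std::vector<unsigned char>& meshlet_triangles,
    const unsigned int* indices, size_t index_count, const float* positions, size_t vertex_count, size_t positions_stride)
{
	const size_t max_vertices = 64, max_triangles = 124;

	meshlets.resize(meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
	meshlet_vertices.resize(index_count);  // worst case is index_count elements, not vertex_count
	meshlet_triangles.resize(index_count); // worst case is index_count elements

	size_t count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
	    indices, index_count, positions, vertex_count, positions_stride, max_vertices, max_triangles, /* cone_weight= */ 0.f);

	meshlets.resize(count);
	return count;
}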
@ -664,40 +670,38 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
|
|||
MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
|
||||
|
||||
/**
|
||||
* Experimental: Meshlet builder with flexible cluster sizes
|
||||
* Meshlet builder with flexible cluster sizes
|
||||
* Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
|
||||
* Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
|
||||
* Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value.
|
||||
*
|
||||
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
|
||||
* meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
|
||||
* meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
|
||||
* meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
|
||||
* meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
|
||||
* meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex
|
||||
* max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
|
||||
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead
|
||||
* max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
|
||||
* cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
|
||||
* split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
|
||||
MESHOPTIMIZER_API size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);

/**
 * Experimental: Meshlet builder that produces clusters optimized for raytracing
 * Meshlet builder that produces clusters optimized for raytracing
 * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but optimizes cluster subdivision for raytracing and allows to specify minimum and maximum number of triangles per meshlet.
 *
 * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
 * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
 * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
 * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (*not* max!)
 * meshlet_vertices must contain enough space for all meshlets, worst case is index_count elements (*not* vertex_count!)
 * meshlet_triangles must contain enough space for all meshlets, worst case is index_count elements
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
 * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
 * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles)
 * fill_weight allows to prioritize clusters that are closer to maximum size at some cost to SAH quality; 0.5 is a safe default
 */
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
MESHOPTIMIZER_API size_t meshopt_buildMeshletsSpatial(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float fill_weight);
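A similar sketch for the spatial builder, again with assumed `indices`/`vertices` inputs and buffers sized as described above; fill_weight of 0.5 follows the "safe default" guidance in the comment.

```cpp
// Sketch only: raytracing-oriented meshlet build; buffers sized the same way as for meshopt_buildMeshletsFlex.
size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), /* max_vertices= */ 64, /* min_triangles= */ 16);

std::vector<meshopt_Meshlet> meshlets(max_meshlets);
std::vector<unsigned int> meshlet_vertices(indices.size());
std::vector<unsigned char> meshlet_triangles(indices.size());

size_t count = meshopt_buildMeshletsSpatial(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
    indices.data(), indices.size(), vertices.data(), vertices.size() / 3, sizeof(float) * 3,
    /* max_vertices= */ 64, /* min_triangles= */ 16, /* max_triangles= */ 64, /* fill_weight= */ 0.5f);
```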

/**
 * Meshlet optimizer
 * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
 * Reorders meshlet vertices and triangles to maximize locality which can improve rasterizer throughput or ray tracing performance when using fast-build modes.
 *
 * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
 * need to be computed from meshlet's vertex_offset and triangle_offset
 * meshlet_triangles and meshlet_vertices must refer to meshlet data; when buildMeshlets* is used, these need to be computed from meshlet's vertex_offset and triangle_offset
 * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
 */
MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
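The offset arithmetic described above looks like this in practice (a sketch; `meshlets`, `meshlet_vertices`, `meshlet_triangles` and `count` are assumed to come from one of the builders):

```cpp
// Sketch only: optimize every meshlet in place using the per-meshlet offsets.
for (size_t i = 0; i < count; ++i)
{
	const meshopt_Meshlet& m = meshlets[i];

	meshopt_optimizeMeshlet(&meshlet_vertices[m.vertex_offset], &meshlet_triangles[m.triangle_offset],
	    m.triangle_count, m.vertex_count);
}
```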

@@ -756,13 +760,14 @@ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeSphereBounds(const float*

/**
 * Cluster partitioner
 * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices or are close to each other.
 * When vertex positions are not provided, only clusters that share vertices will be grouped together, which may result in small partitions for some inputs.
 *
 * destination must contain enough space for the resulting partition data (cluster_count elements)
 * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
 * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
 * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
 * vertex_positions should have float3 position in the first 12 bytes of each vertex (or can be NULL if not used)
 * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
 * vertex_positions can be NULL; when it's not NULL, it should have float3 position in the first 12 bytes of each vertex
 * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger (up to target + target/3)
 */
MESHOPTIMIZER_API size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size);
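A hedged usage sketch: `cluster_indices` and `cluster_index_counts` are assumed to hold the concatenated per-meshlet vertex indices and their counts, and a target of 8 clusters per partition is only an example value.

```cpp
// Sketch only: group clusters into partitions of roughly 8 clusters each.
std::vector<unsigned int> partition(cluster_count);

size_t partition_count = meshopt_partitionClusters(partition.data(),
    cluster_indices.data(), cluster_indices.size(), cluster_index_counts.data(), cluster_count,
    vertices.data(), vertices.size() / 3, sizeof(float) * 3, /* target_partition_size= */ 8);

// vertex_positions may be NULL per the comment above; grouping then relies on shared vertices only.
```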

@@ -805,7 +810,7 @@ MESHOPTIMIZER_API unsigned short meshopt_quantizeHalf(float v);

/**
 * Quantize a float into a floating point value with a limited number of significant mantissa bits, preserving the IEEE-754 fp32 binary representation
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
 * Preserves infinities/NaN, flushes denormals to zero, rounds to nearest
 * Assumes N is in a valid mantissa precision range, which is 1..23
 */
MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
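As a concrete illustration of the mantissa truncation (not from the patch): with N = 10 the top 10 mantissa bits survive, so nearby values snap to a coarser grid while keeping the fp32 bit layout.

```cpp
// Sketch only: quantize a position component to 10 significant mantissa bits;
// the bottom 23 - 10 = 13 mantissa bits are rounded away.
float px = meshopt_quantizeFloat(0.123456789f, 10);
```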
@ -904,13 +909,15 @@ inline size_t meshopt_simplifyWithUpdate(T* indices, size_t index_count, float*
|
|||
template <typename T>
|
||||
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = NULL);
|
||||
template <typename T>
|
||||
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error = NULL);
|
||||
template <typename T>
|
||||
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error);
|
||||
template <typename T>
|
||||
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
|
||||
template <typename T>
|
||||
inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
|
||||
template <typename T>
|
||||
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
|
||||
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
|
||||
template <typename T>
|
||||
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
|
||||
template <typename T>
|
||||
|
|
@ -1288,6 +1295,15 @@ inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t in
|
|||
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, NULL, target_index_count, target_error, result_error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const unsigned char* vertex_lock, size_t target_index_count, float target_error, float* result_error)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, NULL, index_count);
|
||||
|
||||
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_lock, target_index_count, target_error, result_error);
|
||||
}
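For context, a hedged example of calling the new lock-aware overload; `indices`, `vertices` and `vertex_lock` (one unsigned char per vertex, nonzero marking a vertex the simplifier should not move) are assumed inputs.

```cpp
// Sketch only: sloppy simplification to ~10% of the original index count with locked vertices.
size_t target_index_count = indices.size() / 10 / 3 * 3; // keep the target a multiple of 3
float lod_error = 0.f;

std::vector<unsigned int> lod(indices.size());
lod.resize(meshopt_simplifySloppy(lod.data(), indices.data(), indices.size(),
    vertices.data(), vertices.size() / 3, sizeof(float) * 3,
    vertex_lock.data(), target_index_count, /* target_error= */ 1e-2f, &lod_error));
```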
template <typename T>
|
||||
inline size_t meshopt_simplifyPrune(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float target_error)
|
||||
{
|
||||
|
|
@ -1316,11 +1332,11 @@ inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
|
||||
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(NULL, indices, index_count);
|
||||
|
||||
return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
|
||||
return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, primgroup_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@
|
|||
namespace meshopt
|
||||
{
|
||||
|
||||
// To avoid excessive recursion for malformed inputs, we switch to bisection after some depth
|
||||
const int kMergeDepthCutoff = 40;
|
||||
|
||||
struct ClusterAdjacency
|
||||
{
|
||||
unsigned int* offsets;
|
||||
|
|
@ -52,49 +55,44 @@ static void filterClusterIndices(unsigned int* data, unsigned int* offsets, cons
|
|||
offsets[cluster_count] = unsigned(cluster_write);
|
||||
}
|
||||
|
||||
static void computeClusterBounds(float* cluster_bounds, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, const float* vertex_positions, size_t vertex_positions_stride)
|
||||
static float computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, float* out_center)
|
||||
{
|
||||
size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
|
||||
|
||||
for (size_t i = 0; i < cluster_count; ++i)
|
||||
float center[3] = {0, 0, 0};
|
||||
|
||||
// approximate center of the cluster by averaging all vertex positions
|
||||
for (size_t j = 0; j < index_count; ++j)
|
||||
{
|
||||
float center[3] = {0, 0, 0};
|
||||
const float* p = vertex_positions + indices[j] * vertex_stride_float;
|
||||
|
||||
// approximate center of the cluster by averaging all vertex positions
|
||||
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
|
||||
{
|
||||
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
|
||||
|
||||
center[0] += p[0];
|
||||
center[1] += p[1];
|
||||
center[2] += p[2];
|
||||
}
|
||||
|
||||
// note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
|
||||
if (size_t cluster_size = cluster_offsets[i + 1] - cluster_offsets[i])
|
||||
{
|
||||
center[0] /= float(cluster_size);
|
||||
center[1] /= float(cluster_size);
|
||||
center[2] /= float(cluster_size);
|
||||
}
|
||||
|
||||
// compute radius of the bounding sphere for each cluster
|
||||
float radiussq = 0;
|
||||
|
||||
for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j)
|
||||
{
|
||||
const float* p = vertex_positions + cluster_indices[j] * vertex_stride_float;
|
||||
|
||||
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
|
||||
|
||||
radiussq = radiussq < d2 ? d2 : radiussq;
|
||||
}
|
||||
|
||||
cluster_bounds[i * 4 + 0] = center[0];
|
||||
cluster_bounds[i * 4 + 1] = center[1];
|
||||
cluster_bounds[i * 4 + 2] = center[2];
|
||||
cluster_bounds[i * 4 + 3] = sqrtf(radiussq);
|
||||
center[0] += p[0];
|
||||
center[1] += p[1];
|
||||
center[2] += p[2];
|
||||
}
|
||||
|
||||
// note: technically clusters can't be empty per meshopt_partitionCluster but we check for a division by zero in case that changes
|
||||
if (index_count)
|
||||
{
|
||||
center[0] /= float(index_count);
|
||||
center[1] /= float(index_count);
|
||||
center[2] /= float(index_count);
|
||||
}
|
||||
|
||||
// compute radius of the bounding sphere for each cluster
|
||||
float radiussq = 0;
|
||||
|
||||
for (size_t j = 0; j < index_count; ++j)
|
||||
{
|
||||
const float* p = vertex_positions + indices[j] * vertex_stride_float;
|
||||
|
||||
float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
|
||||
|
||||
radiussq = radiussq < d2 ? d2 : radiussq;
|
||||
}
|
||||
|
||||
memcpy(out_center, center, sizeof(center));
|
||||
return sqrtf(radiussq);
|
||||
}
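In equation form (my own restatement, not part of the patch), the bound returned by computeClusterBounds for a cluster referencing vertices p_{i_1}..p_{i_n} is

```latex
c = \frac{1}{n} \sum_{j=1}^{n} p_{i_j}, \qquad
r = \max_{j} \left\lVert p_{i_j} - c \right\rVert
```

i.e. a sphere centered on the vertex centroid; cheap to compute, not minimal, but sufficient for the merge heuristics below.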
static void buildClusterAdjacency(ClusterAdjacency& adjacency, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, size_t cluster_count, size_t vertex_count, meshopt_Allocator& allocator)
|
||||
|
|
@ -211,6 +209,9 @@ struct ClusterGroup
|
|||
int next;
|
||||
unsigned int size; // 0 unless root
|
||||
unsigned int vertices;
|
||||
|
||||
float center[3];
|
||||
float radius;
|
||||
};
|
||||
|
||||
struct GroupOrder
|
||||
|
|
@ -285,15 +286,18 @@ static unsigned int countShared(const ClusterGroup* groups, int group1, int grou
|
|||
return total;
|
||||
}
|
||||
|
||||
static void mergeBounds(float* target, const float* source)
|
||||
static void mergeBounds(ClusterGroup& target, const ClusterGroup& source)
|
||||
{
|
||||
float r1 = target[3], r2 = source[3];
|
||||
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
|
||||
float r1 = target.radius, r2 = source.radius;
|
||||
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
|
||||
float d = sqrtf(dx * dx + dy * dy + dz * dz);
|
||||
|
||||
if (d + r1 < r2)
|
||||
{
|
||||
memcpy(target, source, 4 * sizeof(float));
|
||||
target.center[0] = source.center[0];
|
||||
target.center[1] = source.center[1];
|
||||
target.center[2] = source.center[2];
|
||||
target.radius = source.radius;
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -301,17 +305,17 @@ static void mergeBounds(float* target, const float* source)
|
|||
{
|
||||
float k = d > 0 ? (d + r2 - r1) / (2 * d) : 0.f;
|
||||
|
||||
target[0] += dx * k;
|
||||
target[1] += dy * k;
|
||||
target[2] += dz * k;
|
||||
target[3] = (d + r2 + r1) / 2;
|
||||
target.center[0] += dx * k;
|
||||
target.center[1] += dy * k;
|
||||
target.center[2] += dz * k;
|
||||
target.radius = (d + r2 + r1) / 2;
|
||||
}
|
||||
}
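For reference, the sphere-union math implemented above, restated by me with d = ||c_2 - c_1||: when d + r_1 <= r_2 the second sphere already contains the first and is returned as-is; otherwise the enclosing sphere is

```latex
R = \frac{d + r_1 + r_2}{2}, \qquad
c = c_1 + \frac{d + r_2 - r_1}{2d}\,(c_2 - c_1)
```

which matches the k = (d + r2 - r1) / (2 * d) center shift and (d + r2 + r1) / 2 radius in the code.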
static float boundsScore(const float* target, const float* source)
|
||||
static float boundsScore(const ClusterGroup& target, const ClusterGroup& source)
|
||||
{
|
||||
float r1 = target[3], r2 = source[3];
|
||||
float dx = source[0] - target[0], dy = source[1] - target[1], dz = source[2] - target[2];
|
||||
float r1 = target.radius, r2 = source.radius;
|
||||
float dx = source.center[0] - target.center[0], dy = source.center[1] - target.center[1], dz = source.center[2] - target.center[2];
|
||||
float d = sqrtf(dx * dx + dy * dy + dz * dz);
|
||||
|
||||
float mr = d + r1 < r2 ? r2 : (d + r2 < r1 ? r1 : (d + r2 + r1) / 2);
|
||||
|
|
@ -319,7 +323,7 @@ static float boundsScore(const float* target, const float* source)
|
|||
return mr > 0 ? r1 / mr : 0.f;
|
||||
}
|
||||
|
||||
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, const float* cluster_bounds)
|
||||
static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size, bool use_bounds)
|
||||
{
|
||||
assert(groups[id].size > 0);
|
||||
|
||||
|
|
@ -347,8 +351,8 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj
|
|||
float score = float(int(shared)) * (group_rsqrt + other_rsqrt);
|
||||
|
||||
// incorporate spatial score to favor merging nearby groups
|
||||
if (cluster_bounds)
|
||||
score *= 1.f + 0.4f * boundsScore(&cluster_bounds[id * 4], &cluster_bounds[other * 4]);
|
||||
if (use_bounds)
|
||||
score *= 1.f + 0.4f * boundsScore(groups[id], groups[other]);
|
||||
|
||||
if (score > best_score)
|
||||
{
|
||||
|
|
@ -361,6 +365,120 @@ static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdj
|
|||
return best_group;
|
||||
}
|
||||
|
||||
static void mergeLeaf(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size)
|
||||
{
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
unsigned int id = order[i];
|
||||
if (groups[id].size == 0 || groups[id].size >= target_partition_size)
|
||||
continue;
|
||||
|
||||
float best_score = -1.f;
|
||||
int best_group = -1;
|
||||
|
||||
for (size_t j = 0; j < count; ++j)
|
||||
{
|
||||
unsigned int other = order[j];
|
||||
if (id == other || groups[other].size == 0)
|
||||
continue;
|
||||
|
||||
if (groups[id].size + groups[other].size > max_partition_size)
|
||||
continue;
|
||||
|
||||
// favor merging nearby groups
|
||||
float score = boundsScore(groups[id], groups[other]);
|
||||
|
||||
if (score > best_score)
|
||||
{
|
||||
best_score = score;
|
||||
best_group = other;
|
||||
}
|
||||
}
|
||||
|
||||
// merge id *into* best_group; that way, we may merge more groups into the same best_group, maximizing the chance of reaching target
|
||||
if (best_group != -1)
|
||||
{
|
||||
// combine groups by linking them together
|
||||
unsigned int tail = best_group;
|
||||
while (groups[tail].next >= 0)
|
||||
tail = groups[tail].next;
|
||||
|
||||
groups[tail].next = id;
|
||||
|
||||
// update group sizes; note, we omit vertices update for simplicity as it's not used for spatial merge
|
||||
groups[best_group].size += groups[id].size;
|
||||
groups[id].size = 0;
|
||||
|
||||
// merge bounding spheres
|
||||
mergeBounds(groups[best_group], groups[id]);
|
||||
groups[id].radius = 0.f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t mergePartition(unsigned int* order, size_t count, const ClusterGroup* groups, int axis, float pivot)
|
||||
{
|
||||
size_t m = 0;
|
||||
|
||||
// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
float v = groups[order[i]].center[axis];
|
||||
|
||||
// swap(m, i) unconditionally
|
||||
unsigned int t = order[m];
|
||||
order[m] = order[i];
|
||||
order[i] = t;
|
||||
|
||||
// when v >= pivot, we swap i with m without advancing it, preserving invariants
|
||||
m += v < pivot;
|
||||
}
|
||||
|
||||
return m;
|
||||
}
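To make the unconditional-swap loop above easier to follow, here is a small trace (my own illustration, not part of the patch):

```cpp
// Illustration only: partitioning the axis values {5, 1, 4, 2} around pivot 3.
// i=0: v=5, swap(m=0,i=0) keeps {5,1,4,2}; v >= pivot, so m stays 0
// i=1: v=1, swap(m=0,i=1) gives {1,5,4,2}; v <  pivot, so m becomes 1
// i=2: v=4, swap(m=1,i=2) gives {1,4,5,2}; v >= pivot, so m stays 1
// i=3: v=2, swap(m=1,i=3) gives {1,2,5,4}; v <  pivot, so m becomes 2
// result: m=2; values below the pivot occupy [0, m), the rest occupy [m, count)
```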
static void mergeSpatial(ClusterGroup* groups, unsigned int* order, size_t count, size_t target_partition_size, size_t max_partition_size, size_t leaf_size, int depth)
|
||||
{
|
||||
size_t total = 0;
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
total += groups[order[i]].size;
|
||||
|
||||
if (total <= max_partition_size || count <= leaf_size)
|
||||
return mergeLeaf(groups, order, count, target_partition_size, max_partition_size);
|
||||
|
||||
float mean[3] = {};
|
||||
float vars[3] = {};
|
||||
float runc = 1, runs = 1;
|
||||
|
||||
// gather statistics on the points in the subtree using Welford's algorithm
|
||||
for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
|
||||
{
|
||||
const float* point = groups[order[i]].center;
|
||||
|
||||
for (int k = 0; k < 3; ++k)
|
||||
{
|
||||
float delta = point[k] - mean[k];
|
||||
mean[k] += delta * runs;
|
||||
vars[k] += delta * (point[k] - mean[k]);
|
||||
}
|
||||
}
|
||||
|
||||
// split axis is one where the variance is largest
|
||||
int axis = (vars[0] >= vars[1] && vars[0] >= vars[2]) ? 0 : (vars[1] >= vars[2] ? 1 : 2);
|
||||
|
||||
float split = mean[axis];
|
||||
size_t middle = mergePartition(order, count, groups, axis, split);
|
||||
|
||||
// enforce balance for degenerate partitions
|
||||
// this also ensures recursion depth is bounded on pathological inputs
|
||||
if (middle <= leaf_size / 2 || count - middle <= leaf_size / 2 || depth >= kMergeDepthCutoff)
|
||||
middle = count / 2;
|
||||
|
||||
// recursion depth is logarithmic and bounded due to max depth check above
|
||||
mergeSpatial(groups, order, middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
|
||||
mergeSpatial(groups, order + middle, count - middle, target_partition_size, max_partition_size, leaf_size, depth + 1);
|
||||
}
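The statistics pass above is Welford's running update; restated (my summary, not part of the patch):

```latex
\mu_n = \mu_{n-1} + \frac{x_n - \mu_{n-1}}{n}, \qquad
M_n = M_{n-1} + \left(x_n - \mu_{n-1}\right)\left(x_n - \mu_n\right)
```

so vars[] holds n times the per-axis variance, which is enough to pick the axis with the largest spread in a single pass and without any division.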
} // namespace meshopt
|
||||
|
||||
size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_partition_size)
|
||||
|
|
@ -371,7 +489,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
assert(vertex_positions_stride % sizeof(float) == 0);
|
||||
assert(target_partition_size > 0);
|
||||
|
||||
size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8;
|
||||
size_t max_partition_size = target_partition_size + target_partition_size / 3;
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
|
|
@ -385,20 +503,12 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
filterClusterIndices(cluster_newindices, cluster_offsets, cluster_indices, cluster_index_counts, cluster_count, used, vertex_count, total_index_count);
|
||||
cluster_indices = cluster_newindices;
|
||||
|
||||
// compute bounding sphere for each cluster if positions are provided
|
||||
float* cluster_bounds = NULL;
|
||||
|
||||
if (vertex_positions)
|
||||
{
|
||||
cluster_bounds = allocator.allocate<float>(cluster_count * 4);
|
||||
computeClusterBounds(cluster_bounds, cluster_indices, cluster_offsets, cluster_count, vertex_positions, vertex_positions_stride);
|
||||
}
|
||||
|
||||
// build cluster adjacency along with edge weights (shared vertex count)
|
||||
ClusterAdjacency adjacency = {};
|
||||
buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, vertex_count, allocator);
|
||||
|
||||
ClusterGroup* groups = allocator.allocate<ClusterGroup>(cluster_count);
|
||||
memset(groups, 0, sizeof(ClusterGroup) * cluster_count);
|
||||
|
||||
GroupOrder* order = allocator.allocate<GroupOrder>(cluster_count);
|
||||
size_t pending = 0;
|
||||
|
|
@ -412,6 +522,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
groups[i].vertices = cluster_offsets[i + 1] - cluster_offsets[i];
|
||||
assert(groups[i].vertices > 0);
|
||||
|
||||
// compute bounding sphere for each cluster if positions are provided
|
||||
if (vertex_positions)
|
||||
groups[i].radius = computeClusterBounds(cluster_indices + cluster_offsets[i], cluster_offsets[i + 1] - cluster_offsets[i], vertex_positions, vertex_positions_stride, groups[i].center);
|
||||
|
||||
GroupOrder item = {};
|
||||
item.id = unsigned(i);
|
||||
item.order = groups[i].vertices;
|
||||
|
|
@ -439,7 +553,7 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
if (groups[top.id].size >= target_partition_size)
|
||||
continue;
|
||||
|
||||
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, cluster_bounds);
|
||||
int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size, /* use_bounds= */ vertex_positions);
|
||||
|
||||
// we can't grow the group any more, emit as is
|
||||
if (best_group == -1)
|
||||
|
|
@ -449,14 +563,11 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
unsigned int shared = countShared(groups, top.id, best_group, adjacency);
|
||||
|
||||
// combine groups by linking them together
|
||||
assert(groups[best_group].size > 0);
|
||||
unsigned int tail = top.id;
|
||||
while (groups[tail].next >= 0)
|
||||
tail = groups[tail].next;
|
||||
|
||||
for (int i = top.id; i >= 0; i = groups[i].next)
|
||||
if (groups[i].next < 0)
|
||||
{
|
||||
groups[i].next = best_group;
|
||||
break;
|
||||
}
|
||||
groups[tail].next = best_group;
|
||||
|
||||
// update group sizes; note, the vertex update is a O(1) approximation which avoids recomputing the true size
|
||||
groups[top.id].size += groups[best_group].size;
|
||||
|
|
@ -467,10 +578,10 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
groups[best_group].vertices = 0;
|
||||
|
||||
// merge bounding spheres if bounds are available
|
||||
if (cluster_bounds)
|
||||
if (vertex_positions)
|
||||
{
|
||||
mergeBounds(&cluster_bounds[top.id * 4], &cluster_bounds[best_group * 4]);
|
||||
memset(&cluster_bounds[best_group * 4], 0, 4 * sizeof(float));
|
||||
mergeBounds(groups[top.id], groups[best_group]);
|
||||
groups[best_group].radius = 0;
|
||||
}
|
||||
|
||||
// re-associate all clusters back to the merged group
|
||||
|
|
@ -481,6 +592,20 @@ size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int*
|
|||
heapPush(order, pending++, top);
|
||||
}
|
||||
|
||||
// if vertex positions are provided, we do a final pass to see if we can merge small groups based on spatial locality alone
|
||||
if (vertex_positions)
|
||||
{
|
||||
unsigned int* merge_order = reinterpret_cast<unsigned int*>(order);
|
||||
size_t merge_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < cluster_count; ++i)
|
||||
if (groups[i].size)
|
||||
merge_order[merge_offset++] = unsigned(i);
|
||||
|
||||
mergeSpatial(groups, merge_order, merge_offset, target_partition_size, max_partition_size, /* leaf_size= */ 8, 0);
|
||||
}
|
||||
|
||||
// output each remaining group
|
||||
size_t next_group = 0;
|
||||
|
||||
for (size_t i = 0; i < cluster_count; ++i)
|
||||
|
|
|
|||
|
|
@ -243,14 +243,18 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
|
|||
{
|
||||
// use a bit set to compute the precise number of unique vertices
|
||||
unsigned char* filter = allocator.allocate<unsigned char>((vertex_count + 7) / 8);
|
||||
memset(filter, 0, (vertex_count + 7) / 8);
|
||||
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
{
|
||||
unsigned int index = indices[i];
|
||||
assert(index < vertex_count);
|
||||
filter[index / 8] = 0;
|
||||
}
|
||||
|
||||
size_t unique = 0;
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
{
|
||||
unsigned int index = indices[i];
|
||||
assert(index < vertex_count);
|
||||
|
||||
unique += (filter[index / 8] & (1 << (index % 8))) == 0;
|
||||
filter[index / 8] |= 1 << (index % 8);
|
||||
}
|
||||
|
|
@ -269,7 +273,6 @@ static unsigned int* buildSparseRemap(unsigned int* indices, size_t index_count,
|
|||
for (size_t i = 0; i < index_count; ++i)
|
||||
{
|
||||
unsigned int index = indices[i];
|
||||
|
||||
unsigned int* entry = hashLookup2(revremap, revremap_size, hasher, index, ~0u);
|
||||
|
||||
if (*entry == ~0u)
|
||||
|
|
@ -364,8 +367,8 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
|
|||
memset(loopback, -1, vertex_count * sizeof(unsigned int));
|
||||
|
||||
// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
|
||||
// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
|
||||
// but here it's okay to fill the data out for other types of vertices as well
|
||||
// note that this is the same data as required in loop[] arrays; loop[] data is only used for border/seam by default
|
||||
// in permissive mode we also use it to guide complex-complex collapses, so we fill it for all vertices
|
||||
unsigned int* openinc = loopback;
|
||||
unsigned int* openout = loop;
|
||||
|
||||
|
|
@ -617,7 +620,7 @@ static void rescaleAttributes(float* result, const float* vertex_attributes_data
|
|||
}
|
||||
}
|
||||
|
||||
static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_update)
|
||||
static void finalizeVertices(float* vertex_positions_data, size_t vertex_positions_stride, float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t vertex_count, const Vector3* vertex_positions, const float* vertex_attributes, const unsigned int* sparse_remap, const unsigned int* attribute_remap, float vertex_scale, const float* vertex_offset, const unsigned char* vertex_kind, const unsigned char* vertex_update, const unsigned char* vertex_lock)
|
||||
{
|
||||
size_t vertex_positions_stride_float = vertex_positions_stride / sizeof(float);
|
||||
size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
|
||||
|
|
@ -629,12 +632,20 @@ static void finalizeVertices(float* vertex_positions_data, size_t vertex_positio
|
|||
|
||||
unsigned int ri = sparse_remap ? sparse_remap[i] : unsigned(i);
|
||||
|
||||
const Vector3& p = vertex_positions[i];
|
||||
float* v = vertex_positions_data + ri * vertex_positions_stride_float;
|
||||
// updating externally locked vertices is not allowed
|
||||
if (vertex_lock && (vertex_lock[ri] & meshopt_SimplifyVertex_Lock) != 0)
|
||||
continue;
|
||||
|
||||
v[0] = p.x * vertex_scale + vertex_offset[0];
|
||||
v[1] = p.y * vertex_scale + vertex_offset[1];
|
||||
v[2] = p.z * vertex_scale + vertex_offset[2];
|
||||
// moving locked vertices may result in floating point drift
|
||||
if (vertex_kind[i] != Kind_Locked)
|
||||
{
|
||||
const Vector3& p = vertex_positions[i];
|
||||
float* v = vertex_positions_data + ri * vertex_positions_stride_float;
|
||||
|
||||
v[0] = p.x * vertex_scale + vertex_offset[0];
|
||||
v[1] = p.y * vertex_scale + vertex_offset[1];
|
||||
v[2] = p.z * vertex_scale + vertex_offset[2];
|
||||
}
|
||||
|
||||
if (attribute_count)
|
||||
{
|
||||
|
|
@ -1260,6 +1271,20 @@ static float getNeighborhoodRadius(const EdgeAdjacency& adjacency, const Vector3
|
|||
return sqrtf(result);
|
||||
}
|
||||
|
||||
static unsigned int getComplexTarget(unsigned int v, unsigned int target, const unsigned int* remap, const unsigned int* loop, const unsigned int* loopback)
|
||||
{
|
||||
unsigned int r = remap[target];
|
||||
|
||||
// use loop metadata to guide complex collapses towards the correct wedge
|
||||
// this works for edges on attribute discontinuities because loop/loopback track the single half-edge without a pair, similar to seams
|
||||
if (loop[v] != ~0u && remap[loop[v]] == r)
|
||||
return loop[v];
|
||||
else if (loopback[v] != ~0u && remap[loopback[v]] == r)
|
||||
return loopback[v];
|
||||
else
|
||||
return target;
|
||||
}
|
||||
|
||||
static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
|
||||
{
|
||||
size_t dual_count = 0;
|
||||
|
|
@ -1382,15 +1407,22 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
|
|||
}
|
||||
else
|
||||
{
|
||||
// complex edges can have multiple wedges, so we need to aggregate errors for all wedges
|
||||
// this is different from seams (where we aggregate pairwise) because all wedges collapse onto the same target
|
||||
// complex edges can have multiple wedges, so we need to aggregate errors for all wedges based on the selected target
|
||||
if (vertex_kind[i0] == Kind_Complex)
|
||||
for (unsigned int v = wedge[i0]; v != i0; v = wedge[v])
|
||||
ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
|
||||
{
|
||||
unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
|
||||
|
||||
ei += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
|
||||
}
|
||||
|
||||
if (vertex_kind[i1] == Kind_Complex && bidi)
|
||||
for (unsigned int v = wedge[i1]; v != i1; v = wedge[v])
|
||||
ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[i0], &vertex_attributes[i0 * attribute_count]);
|
||||
{
|
||||
unsigned int t = getComplexTarget(v, i0, remap, loop, loopback);
|
||||
|
||||
ej += quadricError(attribute_quadrics[v], &attribute_gradients[v * attribute_count], attribute_count, vertex_positions[t], &vertex_attributes[t * attribute_count]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1542,7 +1574,9 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
|
|||
|
||||
do
|
||||
{
|
||||
collapse_remap[v] = i1;
|
||||
unsigned int t = getComplexTarget(v, i1, remap, loop, loopback);
|
||||
|
||||
collapse_remap[v] = t;
|
||||
v = wedge[v];
|
||||
} while (v != i0);
|
||||
}
|
||||
|
|
@ -1634,10 +1668,10 @@ static void updateQuadrics(const unsigned int* collapse_remap, size_t vertex_cou
|
|||
}
|
||||
}
|
||||
|
||||
static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
|
||||
static void solvePositions(Vector3* vertex_positions, size_t vertex_count, const Quadric* vertex_quadrics, const QuadricGrad* volume_gradients, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const EdgeAdjacency& adjacency, const unsigned char* vertex_kind, const unsigned char* vertex_update)
|
||||
{
|
||||
#if TRACE
|
||||
size_t stats[5] = {};
|
||||
size_t stats[6] = {};
|
||||
#endif
|
||||
|
||||
for (size_t i = 0; i < vertex_count; ++i)
|
||||
|
|
@ -1645,7 +1679,6 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
|
|||
if (!vertex_update[i])
|
||||
continue;
|
||||
|
||||
// moving externally locked vertices is prohibited
|
||||
// moving vertices on an attribute discontinuity may result in extrapolating UV outside of the chart bounds
|
||||
// moving vertices on a border requires a stronger edge quadric to preserve the border geometry
|
||||
if (vertex_kind[i] == Kind_Locked || vertex_kind[i] == Kind_Seam || vertex_kind[i] == Kind_Border)
|
||||
|
|
@ -1709,36 +1742,64 @@ static void solveQuadrics(Vector3* vertex_positions, float* vertex_attributes, s
|
|||
continue;
|
||||
}
|
||||
|
||||
// reject updates that increase positional error too much; allow some tolerance to improve attribute quality
|
||||
if (quadricError(vertex_quadrics[i], p) > quadricError(vertex_quadrics[i], vp) * 1.5f + 1e-6f)
|
||||
{
|
||||
TRACESTATS(5);
|
||||
continue;
|
||||
}
|
||||
|
||||
TRACESTATS(1);
|
||||
vertex_positions[i] = p;
|
||||
}
|
||||
|
||||
#if TRACE
|
||||
printf("updated %d/%d positions; failed solve %d bounds %d flip %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]));
|
||||
printf("updated %d/%d positions; failed solve %d bounds %d flip %d error %d\n", int(stats[1]), int(stats[0]), int(stats[2]), int(stats[3]), int(stats[4]), int(stats[5]));
|
||||
#endif
|
||||
}
|
||||
|
||||
if (attribute_count == 0)
|
||||
return;
|
||||
|
||||
static void solveAttributes(Vector3* vertex_positions, float* vertex_attributes, size_t vertex_count, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const unsigned char* vertex_update)
|
||||
{
|
||||
for (size_t i = 0; i < vertex_count; ++i)
|
||||
{
|
||||
if (!vertex_update[i])
|
||||
continue;
|
||||
|
||||
// updating externally locked vertices is prohibited
|
||||
if (vertex_kind[i] == Kind_Locked)
|
||||
if (remap[i] != i)
|
||||
continue;
|
||||
|
||||
const Vector3& p = vertex_positions[remap[i]];
|
||||
const Quadric& A = attribute_quadrics[i];
|
||||
|
||||
float iw = A.w == 0 ? 0.f : 1.f / A.w;
|
||||
|
||||
for (size_t k = 0; k < attribute_count; ++k)
|
||||
{
|
||||
const QuadricGrad& G = attribute_gradients[i * attribute_count + k];
|
||||
unsigned int shared = ~0u;
|
||||
|
||||
vertex_attributes[i * attribute_count + k] = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
|
||||
// for complex vertices, preserve attribute continuity and use highest weight wedge if values were shared
|
||||
if (vertex_kind[i] == Kind_Complex)
|
||||
{
|
||||
shared = unsigned(i);
|
||||
|
||||
for (unsigned int v = wedge[i]; v != i; v = wedge[v])
|
||||
if (vertex_attributes[v * attribute_count + k] != vertex_attributes[i * attribute_count + k])
|
||||
shared = ~0u;
|
||||
else if (shared != ~0u && attribute_quadrics[v].w > attribute_quadrics[shared].w)
|
||||
shared = v;
|
||||
}
|
||||
|
||||
// update attributes for all wedges
|
||||
unsigned int v = unsigned(i);
|
||||
do
|
||||
{
|
||||
unsigned int r = (shared == ~0u) ? v : shared;
|
||||
|
||||
const Vector3& p = vertex_positions[i]; // same for all wedges
|
||||
const Quadric& A = attribute_quadrics[r];
|
||||
const QuadricGrad& G = attribute_gradients[r * attribute_count + k];
|
||||
|
||||
float iw = A.w == 0 ? 0.f : 1.f / A.w;
|
||||
float av = (G.gx * p.x + G.gy * p.y + G.gz * p.z + G.gw) * iw;
|
||||
|
||||
vertex_attributes[v * attribute_count + k] = av;
|
||||
v = wedge[v];
|
||||
} while (v != i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -2264,7 +2325,7 @@ static float interpolate(float y, float x0, float y0, float x1, float y1, float
|
|||
// three point interpolation from "revenge of interpolation search" paper
|
||||
float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
|
||||
float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
|
||||
return x1 + num / den;
|
||||
return x1 + (den == 0.f ? 0.f : num / den);
|
||||
}
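For readability, the guarded formula above corresponds to (my restatement of the three-point interpolation step):

```latex
x \approx x_1 + \frac{(y_1 - y)(x_1 - x_2)(x_1 - x_0)(y_2 - y_0)}
                     {(y_2 - y)(x_1 - x_2)(y_0 - y_1) + (y_0 - y)(x_1 - x_0)(y_1 - y_2)}
```

with the new den == 0 check falling back to x_1 instead of dividing by zero.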
} // namespace meshopt
|
||||
|
|
@ -2519,16 +2580,19 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
|
|||
{
|
||||
unsigned int v = result[i];
|
||||
|
||||
// recomputing externally locked vertices may result in floating point drift
|
||||
vertex_update[v] = vertex_kind[v] != Kind_Locked;
|
||||
// mark the vertex for finalizeVertices and root vertex for solve*
|
||||
vertex_update[remap[v]] = vertex_update[v] = 1;
|
||||
}
|
||||
|
||||
// edge adjacency may be stale as we haven't updated it after last series of edge collapses
|
||||
updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
|
||||
|
||||
solveQuadrics(vertex_positions, vertex_attributes, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
|
||||
solvePositions(vertex_positions, vertex_count, vertex_quadrics, volume_gradients, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, adjacency, vertex_kind, vertex_update);
|
||||
|
||||
finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_update);
|
||||
if (attribute_count)
|
||||
solveAttributes(vertex_positions, vertex_attributes, vertex_count, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, vertex_update);
|
||||
|
||||
finalizeVertices(const_cast<float*>(vertex_positions_data), vertex_positions_stride, const_cast<float*>(vertex_attributes_data), vertex_attributes_stride, attribute_weights, attribute_count, vertex_count, vertex_positions, vertex_attributes, sparse_remap, attribute_remap, vertex_scale, vertex_offset, vertex_kind, vertex_update, vertex_lock);
|
||||
}
|
||||
|
||||
// if debug visualization data is requested, fill it instead of index data; for simplicity, this doesn't work with sparsity
|
||||
|
|
|
|||
|
|
@ -208,6 +208,7 @@ static void splitPoints(unsigned int* destination, unsigned int* orderx, unsigne
|
|||
partitionPoints(axis, temp, sides, split, count);
|
||||
}
|
||||
|
||||
// recursion depth is logarithmic and bounded as we always split in approximately half
|
||||
splitPoints(destination, orderx, ordery, orderz, keys, split, scratch, cluster_size);
|
||||
splitPoints(destination + split, orderx + split, ordery + split, orderz + split, keys, count - split, scratch, cluster_size);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ namespace meshopt
|
|||
|
||||
const unsigned char kVertexHeader = 0xa0;
|
||||
|
||||
static int gEncodeVertexVersion = 0;
|
||||
static int gEncodeVertexVersion = 1;
|
||||
const int kDecodeVertexVersion = 1;
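Since the default encode version changes here, callers that must keep producing data for older decoders can pin the version explicitly. A hedged sketch using the public API (meshopt_encodeVertexVersion and the regular encode entry points; `vertices`, `vertex_count` and `vertex_size` are assumed inputs):

```cpp
// Sketch only: keep emitting version-0 vertex streams for consumers that cannot decode version 1 yet.
meshopt_encodeVertexVersion(0);

std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, vertex_size));
vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices, vertex_count, vertex_size));
```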
const size_t kVertexBlockSizeBytes = 8192;
|
||||
|
|
|
|||
|
|
@ -109,28 +109,33 @@ static void decodeFilterOct(T* data, size_t count)
|
|||
|
||||
static void decodeFilterQuat(short* data, size_t count)
|
||||
{
|
||||
const float scale = 1.f / sqrtf(2.f);
|
||||
const float scale = 32767.f / sqrtf(2.f);
|
||||
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
// recover scale from the high byte of the component
|
||||
int sf = data[i * 4 + 3] | 3;
|
||||
float ss = scale / float(sf);
|
||||
float s = float(sf);
|
||||
|
||||
// convert x/y/z to [-1..1] (scaled...)
|
||||
float x = float(data[i * 4 + 0]) * ss;
|
||||
float y = float(data[i * 4 + 1]) * ss;
|
||||
float z = float(data[i * 4 + 2]) * ss;
|
||||
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
|
||||
float x = float(data[i * 4 + 0]);
|
||||
float y = float(data[i * 4 + 1]);
|
||||
float z = float(data[i * 4 + 2]);
|
||||
|
||||
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
|
||||
float ww = 1.f - x * x - y * y - z * z;
|
||||
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
|
||||
float ws = s * s;
|
||||
float ww = ws * 2.f - x * x - y * y - z * z;
|
||||
float w = sqrtf(ww >= 0.f ? ww : 0.f);
|
||||
|
||||
// compute final scale; note that all computations above are unscaled
|
||||
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
|
||||
float ss = scale / s;
|
||||
|
||||
// rounded signed float->int
|
||||
int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
|
||||
int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
|
||||
int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
|
||||
int wf = int(w * 32767.f + 0.5f);
|
||||
int xf = int(x * ss + (x >= 0.f ? 0.5f : -0.5f));
|
||||
int yf = int(y * ss + (y >= 0.f ? 0.5f : -0.5f));
|
||||
int zf = int(z * ss + (z >= 0.f ? 0.5f : -0.5f));
|
||||
int wf = int(w * ss + 0.5f);
|
||||
|
||||
int qc = data[i * 4 + 3] & 3;
|
||||
|
||||
|
|
@ -347,7 +352,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
|
|||
|
||||
static void decodeFilterQuatSimd(short* data, size_t count)
|
||||
{
|
||||
const float scale = 1.f / sqrtf(2.f);
|
||||
const float scale = 32767.f / sqrtf(2.f);
|
||||
|
||||
for (size_t i = 0; i < count; i += 4)
|
||||
{
|
||||
|
|
@ -366,24 +371,27 @@ static void decodeFilterQuatSimd(short* data, size_t count)
|
|||
|
||||
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
|
||||
__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
|
||||
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
|
||||
__m128 s = _mm_cvtepi32_ps(sf);
|
||||
|
||||
// convert x/y/z to [-1..1] (scaled...)
|
||||
__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
|
||||
__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
|
||||
__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
|
||||
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
|
||||
__m128 x = _mm_cvtepi32_ps(xf);
|
||||
__m128 y = _mm_cvtepi32_ps(yf);
|
||||
__m128 z = _mm_cvtepi32_ps(zf);
|
||||
|
||||
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
|
||||
__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
|
||||
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
|
||||
__m128 ws = _mm_mul_ps(s, _mm_add_ps(s, s)); // s*2s instead of 2*(s*s) to work around clang bug with integer multiplication
|
||||
__m128 ww = _mm_sub_ps(ws, _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
|
||||
__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
|
||||
|
||||
__m128 s = _mm_set1_ps(32767.f);
|
||||
// compute final scale; note that all computations above are unscaled
|
||||
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
|
||||
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), s);
|
||||
|
||||
// rounded signed float->int
|
||||
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
|
||||
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
|
||||
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
|
||||
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
|
||||
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, ss));
|
||||
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, ss));
|
||||
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, ss));
|
||||
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, ss));
|
||||
|
||||
// mix x/z and w/y to make 16-bit unpack easier
|
||||
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
|
||||
|
|
@ -542,6 +550,13 @@ inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
|
|||
r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
|
||||
return vmulq_f32(x, r);
|
||||
}
|
||||
|
||||
#ifndef __ARM_FEATURE_FMA
|
||||
inline float32x4_t vfmaq_f32(float32x4_t x, float32x4_t y, float32x4_t z)
|
||||
{
|
||||
return vaddq_f32(x, vmulq_f32(y, z));
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_NEON
|
||||
|
|
@ -572,23 +587,21 @@ static void decodeFilterOctSimd8(signed char* data, size_t count)
|
|||
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
|
||||
|
||||
// compute normal length & scale
|
||||
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
|
||||
float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
|
||||
float32x4_t rl = vrsqrteq_f32(ll);
|
||||
float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
|
||||
|
||||
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
|
||||
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
|
||||
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
|
||||
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
|
||||
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
|
||||
|
||||
// combine xr/yr/zr into final value
|
||||
int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
|
||||
res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
|
||||
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
|
||||
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
|
||||
int32x4_t res = vsliq_n_s32(xr, vsliq_n_s32(yr, zr, 8), 8);
|
||||
res = vbslq_s32(vdupq_n_u32(0xff000000), n4, res);
|
||||
|
||||
vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
|
||||
}
|
||||
|
|
@ -626,21 +639,25 @@ static void decodeFilterOctSimd16(short* data, size_t count)
|
|||
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
|
||||
|
||||
// compute normal length & scale
|
||||
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
|
||||
float32x4_t ll = vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z);
|
||||
#if !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
float32x4_t rl = vrsqrteq_f32(ll);
|
||||
rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
|
||||
float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
|
||||
#else
|
||||
float32x4_t s = vdivq_f32(vdupq_n_f32(32767.f), vsqrtq_f32(ll));
|
||||
#endif
|
||||
|
||||
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
|
||||
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
|
||||
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
|
||||
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, s));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, s));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, s));
|
||||
|
||||
// mix x/z and y/0 to make 16-bit unpack easier
|
||||
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
|
||||
int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
|
||||
int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
|
||||
|
||||
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
|
||||
|
|
@ -658,7 +675,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
|
|||
|
||||
static void decodeFilterQuatSimd(short* data, size_t count)
|
||||
{
|
||||
const float scale = 1.f / sqrtf(2.f);
|
||||
const float scale = 32767.f / sqrtf(2.f);
|
||||
|
||||
for (size_t i = 0; i < count; i += 4)
|
||||
{
|
||||
|
|
@ -677,43 +694,52 @@ static void decodeFilterQuatSimd(short* data, size_t count)
|
|||
|
||||
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
|
||||
int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
|
||||
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
|
||||
float32x4_t s = vcvtq_f32_s32(sf);
|
||||
|
||||
// convert x/y/z to [-1..1] (scaled...)
|
||||
float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
|
||||
float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
|
||||
float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
|
||||
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
|
||||
float32x4_t x = vcvtq_f32_s32(xf);
|
||||
float32x4_t y = vcvtq_f32_s32(yf);
|
||||
float32x4_t z = vcvtq_f32_s32(zf);
|
||||
|
||||
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
|
||||
float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
|
||||
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
|
||||
float32x4_t ws = vmulq_f32(s, s);
|
||||
float32x4_t ww = vsubq_f32(vaddq_f32(ws, ws), vfmaq_f32(vfmaq_f32(vmulq_f32(x, x), y, y), z, z));
|
||||
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
|
||||
|
||||
float32x4_t s = vdupq_n_f32(32767.f);
|
||||
// compute final scale; note that all computations above are unscaled
|
||||
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
|
||||
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), s);
|
||||
|
||||
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
|
||||
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
|
||||
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
|
||||
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
|
||||
int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
|
||||
int32x4_t xr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, x, ss));
|
||||
int32x4_t yr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, y, ss));
|
||||
int32x4_t zr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, z, ss));
|
||||
int32x4_t wr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, w, ss));
|
||||
|
||||
// mix x/z and w/y to make 16-bit unpack easier
|
||||
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
|
||||
int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
|
||||
int32x4_t xzr = vsliq_n_s32(xr, zr, 16);
|
||||
int32x4_t wyr = vsliq_n_s32(wr, yr, 16);
|
||||
|
||||
// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
|
||||
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
|
||||
int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
|
||||
uint64x2_t res_0 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
|
||||
uint64x2_t res_1 = vreinterpretq_u64_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
|
||||
|
||||
// store results to stack so that we can rotate using scalar instructions
|
||||
// TODO: volatile works around LLVM mis-optimizing code; https://github.com/llvm/llvm-project/issues/166808
|
||||
volatile uint64_t res[4];
|
||||
vst1q_u64(const_cast<uint64_t*>(&res[0]), res_0);
|
||||
vst1q_u64(const_cast<uint64_t*>(&res[2]), res_1);
|
||||
|
||||
// rotate and store
|
||||
uint64_t* out = (uint64_t*)&data[i * 4];
|
||||
uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
|
||||
|
||||
out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
|
||||
out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
|
||||
out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
|
||||
out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
|
||||
out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
|
||||
out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
|
||||
out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
|
||||
out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -767,19 +793,16 @@ static void decodeFilterColorSimd8(unsigned char* data, size_t count)
int32x4_t bf = vsubq_s32(yf, vaddq_s32(cof, cgf));

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));

// repack rgba into final value
int32x4_t res = vandq_s32(rr, vdupq_n_s32(0xff));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(gr, vdupq_n_s32(0xff)), 8));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(br, vdupq_n_s32(0xff)), 16));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(ar, vdupq_n_s32(0xff)), 24));
int32x4_t res = vsliq_n_s32(rr, vsliq_n_s32(gr, vsliq_n_s32(br, ar, 8), 8), 8);

vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
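The single vsliq_n_s32 line stands in for the and/or/shift chain next to it: SLI shifts its second operand left and writes it over the first while preserving the first operand's low bits. A scalar sketch of that equivalence, with an illustrative helper name:

#include <stdint.h>

// Scalar model of NEON "shift left and insert": shift b left by n and insert it into a,
// keeping a's low n bits (valid here for n = 8 or 16).
static uint32_t sli32(uint32_t a, uint32_t b, int n)
{
	return (b << n) | (a & ((1u << n) - 1u));
}

// The nested form above then expands to the same byte layout as the or/and/shift chain:
// sli32(rr, sli32(gr, sli32(br, ar, 8), 8), 8)
//   == (rr & 0xff) | ((gr & 0xff) << 8) | ((br & 0xff) << 16) | (ar << 24)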
@ -824,14 +847,14 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);

int32x4_t rr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(rf), ss), fsnap));
int32x4_t gr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(gf), ss), fsnap));
int32x4_t br = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(bf), ss), fsnap));
int32x4_t ar = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(vcvtq_f32_s32(af), ss), fsnap));
int32x4_t rr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(rf), ss));
int32x4_t gr = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(gf), ss));
int32x4_t br = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(bf), ss));
int32x4_t ar = vreinterpretq_s32_f32(vfmaq_f32(fsnap, vcvtq_f32_s32(af), ss));

// mix r/b and g/a to make 16-bit unpack easier
int32x4_t rbr = vorrq_s32(vandq_s32(rr, vdupq_n_s32(0xffff)), vshlq_n_s32(br, 16));
int32x4_t gar = vorrq_s32(vandq_s32(gr, vdupq_n_s32(0xffff)), vshlq_n_s32(ar, 16));
int32x4_t rbr = vsliq_n_s32(rr, br, 16);
int32x4_t gar = vsliq_n_s32(gr, ar, 16);

// pack r/g/b/a using 16-bit unpacks
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(rbr), vreinterpretq_s16_s32(gar)).val[0]);
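The mix-then-zip packing works because each 32-bit lane of rbr holds r in its low half and b in its high half, while gar holds g and a; interleaving their 16-bit lanes then emits r, g, b, a in order. A scalar sketch for a single pixel (pack_rgba16 is a hypothetical name used only to illustrate the lane ordering):

#include <stdint.h>

// One pixel of the 16-bit pack above: build (b << 16) | r and (a << 16) | g, then read the
// halves alternately, which is what the 16-bit zip does across four pixels at once.
static void pack_rgba16(uint16_t r, uint16_t g, uint16_t b, uint16_t a, uint16_t out[4])
{
	uint32_t rb = (uint32_t)r | ((uint32_t)b << 16);
	uint32_t ga = (uint32_t)g | ((uint32_t)a << 16);

	out[0] = (uint16_t)(rb & 0xffff); // r
	out[1] = (uint16_t)(ga & 0xffff); // g
	out[2] = (uint16_t)(rb >> 16);    // b
	out[3] = (uint16_t)(ga >> 16);    // a
}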
@ -958,7 +981,7 @@ static void decodeFilterOctSimd16(short* data, size_t count)
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
const float scale = 32767.f / sqrtf(2.f);

for (size_t i = 0; i < count; i += 4)
{
@ -977,28 +1000,31 @@ static void decodeFilterQuatSimd(short* data, size_t count)
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
v128_t s = wasm_f32x4_convert_i32x4(sf);

// convert x/y/z to [-1..1] (scaled...)
v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
// convert x/y/z to floating point (unscaled! implied scale of 1/sqrt(2.f) * 1/sf)
v128_t x = wasm_f32x4_convert_i32x4(xf);
v128_t y = wasm_f32x4_convert_i32x4(yf);
v128_t z = wasm_f32x4_convert_i32x4(zf);

// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
// reconstruct w as a square root (unscaled); we clamp to 0.f to avoid NaN due to precision errors
// note: i32x4_max with 0 is equivalent to f32x4_max
v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t ws = wasm_f32x4_mul(s, s);
v128_t ww = wasm_f32x4_sub(wasm_f32x4_add(ws, ws), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));

v128_t s = wasm_f32x4_splat(32767.f);
// compute final scale; note that all computations above are unscaled
// we need to divide by sf to get out of fixed point, divide by sqrt(2) to renormalize and multiply by 32767 to get to int16 range
v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), s);

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);

v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, ss), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, ss), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, ss), fsnap);
v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, ss), fsnap);

// mix x/z and w/y to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
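The unscaled w reconstruction a few lines up follows directly from the unit-quaternion constraint: if the stored components carry an implied scale of 1/(sqrt(2) * sf), then w at the same implied scale satisfies W^2 = 2*sf^2 - (X^2 + Y^2 + Z^2), which is exactly the ws + ws expression. A scalar sketch of that step, using a hypothetical helper name:

#include <math.h>

// w^2 = 1 - (x^2 + y^2 + z^2) for a unit quaternion; multiplying through by 2*sf^2 gives the
// unscaled form used above, with a clamp to 0 so quantization error cannot produce a NaN.
static float reconstruct_w_unscaled(float X, float Y, float Z, float sf)
{
	float ww = 2.f * sf * sf - (X * X + Y * Y + Z * Z);
	return sqrtf(ww > 0.f ? ww : 0.f);
}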
@ -1131,7 +1157,7 @@ static void decodeFilterColorSimd16(unsigned short* data, size_t count)
v128_t bf = wasm_i32x4_sub(yf, wasm_i32x4_add(cof, cgf));

// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);

v128_t rr = wasm_f32x4_add(wasm_f32x4_mul(wasm_f32x4_convert_i32x4(rf), ss), fsnap);
@ -1250,7 +1276,7 @@ void meshopt_decodeFilterColor(void* buffer, size_t count, size_t stride)
void meshopt_encodeFilterOct(void* destination, size_t count, size_t stride, int bits, const float* data)
{
assert(stride == 4 || stride == 8);
assert(bits >= 1 && bits <= 16);
assert(bits >= 2 && bits <= 16);

signed char* d8 = static_cast<signed char*>(destination);
short* d16 = static_cast<short*>(destination);
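For context, this is the encoder counterpart of the decode filters shown earlier in the diff. A hedged usage sketch, under the assumption that the input holds four floats per unit vector and that the destination holds count * stride bytes; stride 8 selects the short[4] output handled by d16 above, and bits must satisfy the updated assert (at least 2, at most 16):

#include <stddef.h>
#include <vector>

#include "meshoptimizer.h"

// Encode `count` unit normals (4 floats each) into 16-bit octahedral form, 8 bytes per normal.
void encodeNormalsOct16(const float* normals, size_t count, std::vector<unsigned char>& out)
{
	out.resize(count * 8);
	meshopt_encodeFilterOct(out.data(), count, /* stride */ 8, /* bits */ 12, normals);
}

On the decode side, calling meshopt_decodeFilterOct with the same count and stride reverses the filter after vertex decompression.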