- Home /
How could I optimize this terrain generation?
Hi All,
Out of curiosity I was reading the documentation about IJobParallelFor and decided to try updating the terrain realtime with a generated perlin noise texture. I've tweaked the code quite a bit, and it shows good 30FPS but I feel it could be improved somehow. Any thoughts?
Below I'm sharing a video and the results of the logs in the code:
Job elapsed time: 12
List item
Loop elapsed time: 17
Time elapsed for adding up layers: 30ms (both above)
SetHeights ellapsed time: 58
https://www.youtube.com/watch?v=OteB1mUGEqE
using System.Diagnostics;
using Unity.Jobs;
using UnityEngine;
using Unity.Collections;
using Debug = UnityEngine.Debug;
using Unity.Mathematics;
using Unbegames.Noise;
using System.Collections;
using Unity.Burst;
using System.Threading.Tasks;
public class TerrainGenerator : MonoBehaviour
{
public float speed = 1f;
public float translationX = 0f;
public float translationY = 0f;
public float layer1HeightMultiplier = 3;
public float layer2HeightMultiplier = 3;
public float layer1ScaleMultiplier = 0.5f;
public float layer2ScaleMultiplier = 3f;
public float scale = 0.5f;
private Terrain terrain;
private Coroutine _coroutine;
// Start is called before the first frame update
void Start()
{
_coroutine = StartCoroutine(DrawTerrain());
}
private IEnumerator DrawTerrain()
{
terrain = GetComponent<Terrain>();
var detailWidth = terrain.terrainData.detailWidth;
var detailHeight = terrain.terrainData.detailHeight;
while (true)
{
var heights = GenerateTerrain(detailWidth, detailHeight);
var sw = new Stopwatch();
sw.Start();
//terrain.terrainData.SetHeightsDelayLOD(0, 0, heights);
//terrain.terrainData.SyncHeightmap();
terrain.terrainData.SetHeights(0, 0, heights);
sw.Stop();
Debug.Log($"SetHeights ellapsed time: {sw.ElapsedMilliseconds}");
translationY += Time.deltaTime * speed;
translationX += Time.deltaTime * speed;
yield return new WaitForSeconds(0.10f);
}
}
private void OnDestroy()
{
StopCoroutine(_coroutine);
}
private float[,] GenerateTerrain(int detailWidth, int detailHeight)
{
var sw = new Stopwatch();
sw.Reset();
sw.Start();
var heightsLayer1 = layer(detailWidth, detailHeight, layer1ScaleMultiplier, layer1HeightMultiplier);
sw.Stop();
Debug.Log($"Time elapsed for adding up layers: {sw.ElapsedMilliseconds}ms");
return heightsLayer1;
}
private float[,] layer(int detailWidth, int detailHeight, float scaleMultiplier, float heightMultiplier)
{
var sw = new Stopwatch();
sw.Start();
float[,] result = new float[detailWidth, detailHeight];
var heights = new NativeArray<float3>(detailHeight * detailWidth, Allocator.Persistent);
var modifiers = new NativeArray<Vector2>(1, Allocator.Persistent);
modifiers[0] = new Vector2(scale, heightMultiplier);
var dimensions = new NativeArray<Vector2>(1, Allocator.Persistent);
dimensions[0] = new Vector2(detailWidth, detailHeight);
var translation = new NativeArray<Vector2>(1, Allocator.Persistent);
translation[0] = new Vector2(translationX, translationY);
var job = new LayerJob
{
dimensions = dimensions,
heights = heights,
modifiers = modifiers,
translation = translation
};
int size = heights.Length;
JobHandle jobHandle = job.Schedule(size, 32);
jobHandle.Complete();
sw.Stop();
Debug.Log($"Job elapsed time: {sw.ElapsedMilliseconds}");
sw.Reset();
sw.Start();
for (var i = 0; i < size; i++)
{
result[(int)heights[i].x, (int)heights[i].y] = heights[i].z;
}
sw.Stop();
Debug.Log($"Loop elapsed time: {sw.ElapsedMilliseconds}");
//foreach (var height in heights)
//{
// result[(int)height.x, (int)height.y] = height.z;
//};
heights.Dispose();
dimensions.Dispose();
translation.Dispose();
modifiers.Dispose();
return result;
}
[BurstCompile]
struct LayerJob : IJobParallelFor
{
[ReadOnly]
public NativeArray<Vector2> dimensions;
[ReadOnly]
public NativeArray<Vector2> modifiers;
[ReadOnly]
internal NativeArray<Vector2> translation;
public NativeArray<float3> heights;
public void Execute(int index)
{
var detailWidth = dimensions[0].x;
var detailHeight = dimensions[0].y;
var scale = modifiers[0].x;
var heightMultiplier = modifiers[0].y;
var translationX = translation[0].x;
var translationY = translation[0].y;
var x = (int)Mathf.Floor(index % detailWidth);
var y = (int)Mathf.Floor(index / detailWidth);
//var perlin = new Perlin3D();
//heights[index] = new float3(x, y, perlin.GetValue(0, new float3(x * scale + translationX, y * scale + translationY, 0)) * heightMultiplier);
heights[index] = new float3(x, y, noise.srnoise(new float2(x * scale + translationX, y * scale)) * heightMultiplier);
//heights[index] = new float3(x, y, Mathf.PerlinNoise(x * scale + translationX, y * scale + translationY) * heightMultiplier);
}
}
}
Answer by andrew-lukasik · Mar 10 at 06:10 PM
Ignore absolute time (old i3 cpu) but note that
terrainData.SetHeights
becomes most of the cpu cost here. This is good news because it means our code is efficient.SetHeights
is slow^2 because it forcesTerrain
to recalculate everything at once.
This is how to do that:
using System.Collections;
using UnityEngine;
using UnityEngine.Assertions;
using Unity.Mathematics;
using Unity.Jobs;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Profiling;
using BurstCompile = Unity.Burst.BurstCompileAttribute;
public class TerrainGenerator : MonoBehaviour
{
[SerializeField] Terrain _terrain = null;
[SerializeField] float _speed = 1f;
[SerializeField] float2 _translation = 0f;
[SerializeField] float _heightMultiplier = 0.1f;
[SerializeField] float _noiseScale = 5f;
NativeArray<float> _heightsNative;
float[,] _heights;
ProfilerMarker ___tick = new ProfilerMarker("tick");
ProfilerMarker ___set_heights = new ProfilerMarker("set_heights");
ProfilerMarker ___generate_layer = new ProfilerMarker("generate_layer");
ProfilerMarker ___native_to_managed = new ProfilerMarker("native_to_managed");
IEnumerator Start ()
{
var terrainData = _terrain.terrainData;
var width = terrainData.detailWidth;
var height = terrainData.detailHeight;
_heightsNative = new NativeArray<float>( height * width , Allocator.Persistent );
_heights = new float[ width , height ];
var step = new WaitForSeconds( 0.1f );
while( true )
{
___tick.Begin();
___generate_layer.Begin();
new LayerJob
{
Width = width ,
Height = height ,
NoiseCoordScale = _noiseScale ,
HeightMultiplier = _heightMultiplier ,
Translation = _translation ,
Heightmap = _heightsNative ,
}.Schedule( _heightsNative.Length , 32 ).Complete();
___native_to_managed.Begin();
MemCpy( _heightsNative , _heights );
___native_to_managed.End();
___generate_layer.End();
___set_heights.Begin();
terrainData.SetHeights( 0 , 0 , _heights );
___set_heights.End();
_translation += Time.deltaTime * _speed;
___tick.End();
yield return step;
}
}
void OnDestroy ()
{
if( _heightsNative.IsCreated ) _heightsNative.Dispose();
}
unsafe void MemCpy <SRC,DST> ( NativeArray<SRC> src , DST[,] dst )
where SRC : unmanaged
where DST : struct
{
int srcSize = src.Length * UnsafeUtility.SizeOf<SRC>();
int dstSize = dst.Length * UnsafeUtility.SizeOf<DST>();
if( srcSize==dstSize )
{
void* srcPtr = NativeArrayUnsafeUtility.GetUnsafePtr( src );
void* dstPtr = UnsafeUtility.PinGCArrayAndGetDataAddress( dst , out ulong dstHandle );
UnsafeUtility.MemCpy( destination:dstPtr , source:srcPtr , size:srcSize );
UnsafeUtility.ReleaseGCObject( dstHandle );
}
else Debug.LogError( $"<b>src</b> ({srcSize}[b]) and <b>dst</b> ({dstSize}[b]) must be of equal size. MemCpy aborted." );
}
[BurstCompile]
struct LayerJob : IJobParallelFor
{
public float Width, Height, HeightMultiplier;
public float2 NoiseCoordScale, Translation;
[WriteOnly] public NativeArray<float> Heightmap;
void IJobParallelFor.Execute ( int index )
{
float tx = ( index % Width )/Width;
float ty = ( index / Width )/Height;
float2 pos = new float2(tx,ty) * NoiseCoordScale + Translation;
Heightmap[index] = noise.cnoise(pos) * HeightMultiplier;
}
}
}
Damn! That MemCpy blew my mind! In a way I'm happy as my job was efficient and really I was losing 20ms copying the result with a for loop. What a code! Also I'm really glad you showed me how to use the profiler, I was always debugging times of execution with Debug.Log messages! Would you mind telling me your reasoning about how to approach this code for optimizing it? I can see there's some float magic going on as you multiply float by float2 types and changed also the types used in the job. Thank you so much!
Thanks. You're welcome!
Allocating 3 x
NativeArray<Vector2>
of length 1 just to store 3 sets of 2xfloat solves nothing and decreases performance by reducing memory locality (reading values close to each other in address space)noise.srnoise
is considerably slower thannoise.cnoise
while producing similar resultsUnity.Mathematics
+Burst
introduces SIMD. SIMD is vector multiplication (etc) for the same price as multiplication of two floats.
Could you point me to a reference where I can study about SIMD? It seems quite powerful and useful
Your answer
Follow this Question
Related Questions
Lowered general performance with Threads 1 Answer
Getting a stack trace of a specific thread 1 Answer
Help with Optimizing Voxel Code? 1 Answer
Double clicking console output does not take me to the corresponding line in code. 1 Answer
Are conditional breakpoints when debugging in VS2017 working for you? 2 Answers