- Home /
Multithreading doesn’t improve performance
Hello,I’m doing a project where I need to make a big amount of calculations.In fact I need to repeat the same kind of calculation about 3 milions of time.This obviously is really painful for performance, so I try to improve performances getting this code multithreaded.The basic calculations have this aspect:
using UnityEngine;
using System.Collections;
public class NoThread : MonoBehaviour {
int i,j,k;
float[,,] matrix;
float deltaTime;
float oldTime;
float value;
public GameObject cube;
System.Random rnd;
//visualization variable
int w,h;
GUIStyle style;
Rect rect;
void Start() {
matrix = new float[100,100,100];
w = Screen.width; h = Screen.height;
style = new GUIStyle ();
style.alignment = TextAnchor.UpperLeft;
style.fontSize = h * 2 / 100;
style.normal.textColor = new Color (0.0f, 0.0f, 0.5f, 1.0f);
rect = new Rect (0, 0, w, h * 2 / 100);
rnd = new System.Random ();
}
void Update(){
oldTime = Time.realtimeSinceStartup;
for(i=0;i<100;i++) {
for(j=0;j<100;j++) {
for(k=0;k<100;k++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
for(k=0;k<100;k++) {
for(j=0;j<100;j++) {
for(i=0;i<100;i++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
for(i=0;i<100;i++) {
for(k=0;k<100;k++) {
for(j=0;j<100;j++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
deltaTime = Time.realtimeSinceStartup - oldTime;
value = 0;
for(i=0;i<100;i++) {
for(k=0;k<100;k++) {
for(j=0;j<100;j++) {
value += matrix[i,j,k]/1000;
}
}
}
Debug.Log (value);
cube.GetComponent<Transform> ().localScale = new Vector3(value,value,value);
}
void OnGUI ()
{
float msec = deltaTime * 1000.0f;
float fps = 1.0f / deltaTime;
string text = string.Format ("Calculations performance : {0:0.0} ms ({1:0.} fps)", msec, fps);
GUI.Label (rect, text, style);
}
}
The first step to improve my code was create a different thread that makes calculations, because in this way the application can run smoothly and then in the main thread apply the results to the scene. So I had to divide the calculations from the part of apply due to the limitations of Unity and its not thread-safe feature. So my code becomes like this:
using UnityEngine;
using System.Collections;
using System.Threading;
public class SingleThread : MonoBehaviour
{
int i, j, k;
float[,,] matrix;
float deltaTime;
float oldTime;
float value;
public GameObject cube;
bool endCalc = false, running =true;
Thread t;
System.Random rnd;
//visualization variable
int w, h;
GUIStyle style;
Rect rect;
void Start ()
{
matrix = new float[100, 100, 100];
w = Screen.width;
h = Screen.height;
style = new GUIStyle ();
style.alignment = TextAnchor.UpperLeft;
style.fontSize = h * 2 / 100;
style.normal.textColor = new Color (0.0f, 0.0f, 0.5f, 1.0f);
rect = new Rect (0, 0, w, h * 2 / 100);
rnd = new System.Random ();
t = new Thread (threadUpdate);
t.Start ();
oldTime = Time.realtimeSinceStartup;
}
void Update ()
{
if (endCalc) {
deltaTime = Time.realtimeSinceStartup - oldTime;
oldTime = Time.realtimeSinceStartup;
value = 0;
for (i = 0; i < 100; i++) {
for (k = 0; k < 100; k++) {
for (j = 0; j < 100; j++) {
value += matrix [i, j, k] / 1000;
}
}
}
cube.GetComponent<Transform> ().localScale = new Vector3 (value, value, value);
endCalc = false;
}
}
void OnGUI ()
{
float msec = deltaTime * 1000.0f;
float fps = 1.0f / deltaTime;
string text = string.Format ("Calculations performance : {0:0.0} ms ({1:0.} fps)", msec, fps);
GUI.Label (rect, text, style);
}
void threadUpdate ()
{
while (running) {
if (!endCalc) {
for(i=0;i<100;i++) {
for(j=0;j<100;j++) {
for(k=0;k<100;k++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
for(k=0;k<100;k++) {
for(j=0;j<100;j++) {
for(i=0;i<100;i++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
for(i=0;i<100;i++) {
for(k=0;k<100;k++) {
for(j=0;j<100;j++) {
matrix [i, j, k] += rnd.Next(-1,2);
}
}
}
endCalc = true;
}
}
}
void OnApplicationQuit(){
running = false;
}
}
Then I think that could be better try to divide calculations on more thread so I create 6 thread and I divide the three for loop in 6 semi for loop.
using UnityEngine;
using System.Collections;
using System.Threading;
public class MultiThread : MonoBehaviour
{
float[,,] matrix;
float deltaTime;
float oldTime;
float value;
public GameObject cube;
bool endCalc = false, running =true;
Thread t;
Thread[] threads;
System.Random[] rnd;
bool firstPart;
bool[] calcResults;
//visualization variable
int w, h;
GUIStyle style;
Rect rect;
void Start ()
{
matrix = new float[100, 100, 100];
w = Screen.width;
h = Screen.height;
style = new GUIStyle ();
style.alignment = TextAnchor.UpperLeft;
style.fontSize = h * 2 / 100;
style.normal.textColor = new Color (0.0f, 0.0f, 0.5f, 1.0f);
rect = new Rect (0, 0, w, h * 2 / 100);
oldTime = Time.realtimeSinceStartup;
rnd = new System.Random [6];
calcResults = new bool[6];
for (int i = 0; i < 6; i++) {
calcResults [i] = true;
rnd [i] = new System.Random ();
}
t = new Thread (threadUpdate);
t.Start ();
threads = new Thread [6];
threads [0] = new Thread (threadUpdate0);
threads [1] = new Thread (threadUpdate1);
threads [2] = new Thread (threadUpdate2);
threads [3] = new Thread (threadUpdate3);
threads [4] = new Thread (threadUpdate4);
threads [5] = new Thread (threadUpdate5);
threads[0].Start ();
threads[1].Start ();
threads[2].Start ();
threads[3].Start ();
threads[4].Start ();
threads[5].Start ();
firstPart = false;
endCalc = false;
}
void Update ()
{
if (endCalc) {
deltaTime = Time.realtimeSinceStartup - oldTime;
oldTime = Time.realtimeSinceStartup;
value = 0;
for (int i = 0; i < 100; i++) {
for (int k = 0; k < 100; k++) {
for (int j = 0; j < 100; j++) {
value += matrix [i, j, k] / 1000;
}
}
}
cube.GetComponent<Transform> ().localScale = new Vector3 (value, value, value);
endCalc = false;
}
}
void OnGUI ()
{
float msec = deltaTime * 1000.0f;
float fps = 1.0f / deltaTime;
string text = string.Format ("Calculations performance : {0:0.0} ms ({1:0.} fps)", msec, fps);
GUI.Label (rect, text, style);
}
void threadUpdate ()
{
while (running) {
if (!endCalc) {
if (!firstPart) {
firstPart = true;
for (int i = 0; i < 6; i++)
calcResults [0] = false;
} else if (calcResults [0] && calcResults [1] && calcResults [2]&&
calcResults [3] && calcResults [4] && calcResults [5]) {
endCalc = true;
firstPart = false;
}
}
}
}
void threadUpdate0 ()
{
while (running) {
if (!calcResults[0]) {
for(int i=0;i<50;i++) {
for(int j=0;j<100;j++) {
for(int k=0;k<100;k++) {
matrix [i, j, k] += rnd[0].Next(-1,2);
}
}
}
calcResults[0] = true;
}
}
}
void threadUpdate1 ()
{
while (running) {
if (!calcResults[1]) {
for(int k=0;k<50;k++) {
for(int j=0;j<100;j++) {
for(int i=0;i<100;i++) {
matrix [i, j, k] += rnd[1].Next(-1,2);
}
}
}
calcResults[1] = true;
}
}
}
void threadUpdate2 ()
{
while (running) {
if (!calcResults[2]) {
for(int i=0;i<50;i++) {
for(int k=0;k<100;k++) {
for(int j=0;j<100;j++) {
matrix [i, j, k] += rnd[2].Next(-1,2);
}
}
}
calcResults[2] = true;
}
}
}
void threadUpdate3 ()
{
while (running) {
if (!calcResults[3]) {
for(int i=50;i<100;i++) {
for(int j=0;j<100;j++) {
for(int k=0;k<100;k++) {
matrix [i, j, k] += rnd[3].Next(-1,2);
}
}
}
calcResults[3] = true;
}
}
}
void threadUpdate4 ()
{
while (running) {
if (!calcResults[4]) {
for(int k=50;k<100;k++) {
for(int j=0;j<100;j++) {
for(int i=0;i<100;i++) {
matrix [i, j, k] += rnd[4].Next(-1,2);
}
}
}
calcResults[4] = true;
}
}
}
void threadUpdate5 ()
{
while (running) {
if (!calcResults[5]) {
for(int i=50;i<100;i++) {
for(int k=0;k<100;k++) {
for(int j=0;j<100;j++) {
matrix [i, j, k] += rnd[5].Next(-1,2);
}
}
}
calcResults[5] = true;
}
}
}
void OnApplicationQuit(){
running = false;
}
}
The problem is that the version multi thread doesn't improve the performance by 6, in fact the fps become only two time better than single thread version.Someone of you know why this happends?I see by profiler that I use more CPU power, but it looks like there is something that slowdown my code in multithread.I'm testing this code on PC, Mac and Ps4. You can find the complete project for Unity 5.2.2 at this link . Thanks to everyone for the attention
EDIT:
I create also a three thread version and it runs like six thread version
using UnityEngine;
using System.Collections;
using System.Threading;
public class ThreeThread : MonoBehaviour
{
float[,,] matrix;
float deltaTime;
float oldTime;
float value;
public GameObject cube;
bool endCalc = false, running =true;
Thread t;
Thread[] threads;
System.Random[] rnd;
bool firstPart;
bool[] calcResults;
//visualization variable
int w, h;
GUIStyle style;
Rect rect;
void Start ()
{
matrix = new float[100, 100, 100];
w = Screen.width;
h = Screen.height;
style = new GUIStyle ();
style.alignment = TextAnchor.UpperLeft;
style.fontSize = h * 2 / 100;
style.normal.textColor = new Color (0.0f, 0.0f, 0.5f, 1.0f);
rect = new Rect (0, 0, w, h * 2 / 100);
oldTime = Time.realtimeSinceStartup;
rnd = new System.Random [3];
calcResults = new bool[3];
for (int i = 0; i < 3; i++) {
calcResults [i] = true;
rnd [i] = new System.Random ();
}
t = new Thread (threadUpdate);
t.Start ();
threads = new Thread [3];
threads [0] = new Thread (threadUpdate0);
threads [1] = new Thread (threadUpdate1);
threads [2] = new Thread (threadUpdate2);
threads[0].Start ();
threads[1].Start ();
threads[2].Start ();
firstPart = false;
endCalc = false;
}
void Update ()
{
if (endCalc) {
deltaTime = Time.realtimeSinceStartup - oldTime;
oldTime = Time.realtimeSinceStartup;
value = 0;
for (int i = 0; i < 100; i++) {
for (int k = 0; k < 100; k++) {
for (int j = 0; j < 100; j++) {
value += matrix [i, j, k] / 1000;
}
}
}
cube.GetComponent<Transform> ().localScale = new Vector3 (value, value, value);
endCalc = false;
}
}
void OnGUI ()
{
float msec = deltaTime * 1000.0f;
float fps = 1.0f / deltaTime;
string text = string.Format ("Prestazioni calcoli : {0:0.0} ms ({1:0.} fps)", msec, fps);
GUI.Label (rect, text, style);
}
void threadUpdate ()
{
Debug.Log ("chiamato principale");
while (running) {
Debug.Log ("entra ciclo principale");
if (!endCalc) {
if (!firstPart) {
firstPart = true;
for (int i = 0; i < 3; i++)
calcResults [0] = false;
} else if (calcResults [0] && calcResults [1] && calcResults [2]) {
endCalc = true;
firstPart = false;
}
}
}
}
void threadUpdate0 ()
{
while (running) {
if (!calcResults[0]) {
Debug.Log ("chiamato");
for(int i=0;i<100;i++) {
for(int j=0;j<100;j++) {
for(int k=0;k<100;k++) {
matrix [i, j, k] += rnd[0].Next(-1,2);
}
}
}
calcResults[0] = true;
}
}
}
void threadUpdate1 ()
{
while (running) {
if (!calcResults[1]) {
for(int k=0;k<100;k++) {
for(int j=0;j<100;j++) {
for(int i=0;i<100;i++) {
matrix [i, j, k] += rnd[1].Next(-1,2);
}
}
}
calcResults[1] = true;
}
}
}
void threadUpdate2 ()
{
while (running) {
if (!calcResults[2]) {
for(int i=0;i<100;i++) {
for(int k=0;k<100;k++) {
for(int j=0;j<100;j++) {
matrix [i, j, k] += rnd[2].Next(-1,2);
}
}
}
calcResults[2] = true;
}
}
}
void OnApplicationQuit(){
running = false;
}
}
I have an I7 4700k on my PC and another i7 on my $$anonymous$$acbook pro so 4 physical core.
Confirmed that the FPS doubles when going from 1 to 6 threads.
I checked, and going from 1 to 6 threads makes all four of my cores start working (ins$$anonymous$$d of just the 1), so you're not running into any issue where you're not getting the work you need out of your CPU. I would never expect a 6-fold improvement, unless you have 6+ cores and are working perfectly with the data.
You could try looking into the array layout. If your array is too large to be fetched from memory all at once, how you loop through it affects performance. See here for an example. It might simply be that the order you're iterating in the multi-thread version is worse than the order in the single-thread version.
I already thought that could be a problem related to memory, because in my project where I have much bigger class used in my array the performances improvement is equal to zero. In fact if in this test example I gain a x2 in performance with multithread, in my main project I have a gain equal to x1 in performance. $$anonymous$$y project use a my custom datatype that include 2 float and 1 vector3.
I made a three thread version that I write on the main post and it runs like 6 thread ones. How is it possible?
Possibly just an aside, I don't know if it would impact on what you're looking at, but it seems odd to me to have an OnGUI function, (especially with string stuff being done in it), when looking at performance. Why not just store the time when the process starts and then look at how much time has passed when it's finished?
Yes I know but that it's only an extreme needs of debugging obviously in the project should not be present. But actually I don't want to know the better fps i get, but I would like to know if doing multi thread I gain something or not, and if not why. So I put the same function in the OnGui on every version so they slowdown equally each version.
I guess my point is that the load on your CPUs is made up of (load due to the calculation) plus (other stuff including the engine). You're only modifying the first part of that. If you had 2 cores, you wouldn't expect putting the calculation into two threads to double the overall speed, would you? Because some of the processor time is being used for other stuff, you don't actually have 2 whole cores available to the calculation.
So the percentage change in performance overall isn't actually a measure of how much you gain from multithreading the calculation. You'd need to subtract the (constant) load due to the other stuff in order to work that out, or $$anonymous$$imise the other stuff (by eg removing the OnGUI calls) in order to make the overall performance gain a better measure of the gain obtained from restructuring the calculation.
Hi, I can see 2 things that are not cache friendly and must cripple the performances:
First, when accessing a table in memory, it's best to access it in a "linear" way so cache usage is maximized = for a 1d array: going from 0 to max and not the other direction will prevent cache miss. For your case: a 3 dimensional array is better accessed like this: if loops are: for i, then for j, then for k : array[i,j,k] = value. This is even more important for the final sum/divide operation on the main thread.
Second: all the threads write to the same matrix array. This is the worst for multi-core cache usage. Here is a good read about that: https://fgiesen.wordpress.com/2014/07/07/cache-coherency/. The way cache works is that if a thread modify a cache line, all other cores must discard this same cache line. The way the matrix is wrote too: all threads will disable cache lines all the time, resulting in a lot of cache miss & refresh. What I would do is have each thread have it's own matrix memory to write to, so cache line don't overlap for each thread. Then the main thread will sum each matrix in another one. If possible, sum one matrix at a time to maximize cache usage : a+=b, then a+=c, then a+=d and not a=a+b+c which might use more cache (well, this last optimization has to be tested, both options should be checked ;-)
It should not result in a perfect "x NbOfThread" speedup, multhreading never do that :-)
For the first part I'm ok with what you say and on the main thread I can do what you think, but on my project I need to move me on the matrix a long three different dimensions in the three for loop so I can't change order.
The second part I will try to that but I'm afraid to be memory limited to have 6 big matrix by one million of cell.
Your answer
Follow this Question
Related Questions
allocate specific cpu-cores for Unity 1 Answer
Unity Jobs and BURST low performance on Html5 1 Answer
Can i call an external method from a job? 0 Answers
Calculating InverseTransformPoint outside unity 1 Answer
GPU warm up 1 Answer