- Home /
simple Lipsync with microphone input
Hi, I want to build a very very simple lipsync animation using the microphone input. My idea was to get the volume from the microphone and move the jaw. Pretty much like this demo: http://qlcomp.com/?page_id=33 , but in real time. After some googling, I found people who know how to calculate the volume, so I adapted the code into:
/// Returns the median absolute amplitude of the most recent output samples
/// of the attached AudioSource, or 0 when there is no AudioSource or no
/// sample window to analyse.
private float GetVolume()
{
    if (audio == null || samples <= 0)
        return 0;

    float[] data = new float[samples];
    audio.GetOutputData(data, 0);

    // Rectify in place so the buffer can be sorted directly; this avoids
    // boxing every sample into a non-generic ArrayList on each call.
    for (int i = 0; i < data.Length; i++)
    {
        data[i] = Mathf.Abs(data[i]);
    }

    // Take the median of the recorded samples.
    System.Array.Sort(data);
    return data[samples / 2];
}
And also, a function to filter noise:
// Highest frequency the spectrum is assumed to cover (the code treats bin n
// as frequency n * fMax / samples).
float fMax = 24000;

/// Averages the spectrum magnitudes of the bins between fLow and fHigh (Hz).
/// Returns 0 when there is no AudioSource or no sample window.
private float HumanFreq(float fLow, float fHigh)
{
    if (audio == null || samples <= 0)
        return 0;

    float[] data = new float[samples];
    fLow = Mathf.Clamp(fLow, 20, fMax); // limit low...
    fHigh = Mathf.Clamp(fHigh, fLow, fMax); // and high frequencies

    // get spectrum: data[n] = vol of frequency n * fMax / samples
    audio.GetSpectrumData(data, 0, FFTWindow.BlackmanHarris);

    // A frequency equal to fMax maps to index == samples, which is one past
    // the end of the array, so both indices are clamped to the last bin.
    int lastBin = samples - 1;
    int n1 = Mathf.Min((int)Mathf.Floor(fLow * samples / fMax), lastBin);
    int n2 = Mathf.Min((int)Mathf.Floor(fHigh * samples / fMax), lastBin);

    // average the volumes of frequencies fLow to fHigh
    float sum = 0;
    for (int i = n1; i <= n2; i++)
    {
        sum += data[i];
    }
    return sum / (n2 - n1 + 1);
}
My problem is that GetVolume returns 0 almost always (randomly not), and HumanFreq does not filter anything.
Does anyone know what's wrong? Or even any other way to do what I'm trying? I wouldn't mind completely changing the approach, but I need something simple. Phoneme recognition and other techniques are much better looking but also much more complex. I also wouldn't mind buying assets if they were cheap and easy to use. (Yes, simple is the word :) I don't want to spend too much time on this)
Answer by Hannibalov · Jun 28, 2012 at 10:25 AM
using UnityEngine; using System.Collections.Generic; using System.Collections;
/// <summary>
/// Very simple volume-driven lip sync: while the average spectrum magnitude
/// in a fixed frequency band is inside a fixed range, the object this
/// component is attached to is eased from its initial local position towards
/// <see cref="endPos"/>; otherwise it snaps back to the initial position.
/// The sound can come from an AudioClip or from the default microphone.
/// </summary>
public class LipSync : MonoBehaviour {
    private Transform mouth;
    private Vector3 initialPos;
    public Vector3 endPos; // Local position of the mouth when fully open
    private bool initialized = false;
    private bool microphoneClip = false;
    private AudioClip recordedClip;
    private const int FREQUENCY = 48000; // Sample rate assumed when mapping Hz to spectrum bins
    private const int SAMPLECOUNT = 1024;
    private const float REFVALUE = 0.1f; // RMS value for 0 dB.
    private const float THRESHOLD = 0.02f; // Minimum amplitude to extract pitch (receive anything)
    private const float OPENTIME = 0.3f; // Seconds the mouth takes to open fully
    private const float MICSTARTTIMEOUT = 1f; // Seconds to wait for the microphone to deliver data
    private float[] samples; // Samples
    private float[] spectrum; // Spectrum
    private float rmsValue; // Volume in RMS
    private float dbValue; // Volume in DB
    private float pitchValue; // Pitch - Hz (is this frequency?)
    public int clamp = 160; // Used to clamp dB
    // Starts at OPENTIME rather than 0: Update divides by it, and 0/0 on the
    // first "open" frame would feed NaN into the Lerp.
    private float time = OPENTIME;
    private float elapsedTime;

    /// <summary>
    /// Allocates the analysis buffers, remembers the rest position, makes sure
    /// an AudioSource exists and disables the component until playback starts.
    /// Does nothing if already initialized.
    /// </summary>
    public void Init ()
    {
        if (initialized)
            return;
        samples = new float[SAMPLECOUNT];
        spectrum = new float[SAMPLECOUNT];
        mouth = transform;
        initialPos = mouth.localPosition;
        if (audio == null)
            gameObject.AddComponent<AudioSource> ();
        audio.playOnAwake = false;
        audio.loop = false;
        enabled = false;
        initialized = true;
    }

    /// <summary>
    /// Plays <paramref name="clip"/> on the AudioSource and enables the
    /// per-frame analysis.
    /// </summary>
    public void StartClipLipSync (AudioClip clip, bool fromMicrophone)
    {
        Init ();
        audio.clip = clip;
        audio.Play ();
        enabled = true;
        microphoneClip = fromMicrophone;
        Debug.Log ("Clip started for " + transform.root.gameObject.name);
    }

    /// <summary>
    /// Pauses playback and returns the playback position in seconds.
    /// </summary>
    public float PauseClip ()
    {
        // Init first, like the sibling methods, so this cannot hit a missing
        // AudioSource when called before any Start* method.
        Init ();
        audio.Pause ();
        return audio.time;
    }

    /// <summary>Stops playback.</summary>
    public void StopClip ()
    {
        Init ();
        audio.Stop ();
    }

    /// <summary>Assigns <paramref name="clip"/> and plays it from <paramref name="t"/> seconds.</summary>
    public void RestartClip (AudioClip clip, float t)
    {
        Init ();
        audio.clip = clip;
        audio.time = t;
        audio.Play ();
    }

    /// <summary>Plays the current clip from <paramref name="t"/> seconds.</summary>
    public void RestartClip (float t)
    {
        Init ();
        audio.time = t;
        audio.Play ();
    }

    /// <summary>Stops the current clip and plays it again.</summary>
    public void RestartClip ()
    {
        Init ();
        audio.Stop ();
        audio.Play ();
    }

    /// <summary>
    /// Starts recording from the default microphone and routes it through the
    /// (muted) AudioSource so it can be analysed each frame. Disables the
    /// component when no microphone is available or it never delivers data.
    /// </summary>
    public void StartMicrophoneLipSync ()
    {
        Init ();
        microphoneClip = true;
        if (Microphone.devices.Length == 0) {
            enabled = false;
            return;
        }

        // null selects the default device. This matches the IsRecording(null)
        // and End(null) calls used when stopping, and does not depend on a
        // platform-specific device name.
        audio.clip = Microphone.Start (null, true, 999, 44100);

        // Wait until the microphone has produced data, but give up after a
        // short timeout instead of freezing the main thread forever.
        float deadline = Time.realtimeSinceStartup + MICSTARTTIMEOUT;
        while (Microphone.GetPosition (null) <= 0) {
            if (Time.realtimeSinceStartup > deadline) {
                Microphone.End (null);
                enabled = false;
                Debug.LogWarning ("Microphone did not start for " + transform.root.gameObject.name);
                return;
            }
        }

        // The recording clip wraps around; without looping the AudioSource
        // stops when it reaches the end of the clip.
        audio.loop = true;
        audio.Play ();
        audio.mute = true;
        enabled = true;
        Debug.Log ("Microphone started for " + transform.root.gameObject.name);
    }

    /// <summary>Returns the clip stored by the last call to StopMicrophone.</summary>
    public AudioClip GetRecordedClip ()
    {
        return recordedClip;
    }

    void OnDestroy ()
    {
        if (Microphone.IsRecording (null))
            Microphone.End (null);
        Destroy (audio);
    }

    /// <summary>
    /// Ends the microphone recording, keeps the recorded clip and removes the
    /// AudioSource. The next Init call creates a fresh AudioSource.
    /// </summary>
    public void StopMicrophone ()
    {
        if (audio == null)
            return;
        if (Microphone.IsRecording (null))
            Microphone.End (null);
        recordedClip = audio.clip;
        DestroyImmediate (audio);
        initialized = false;
    }

    //This could be programmed as InvokeRepeating
    void Update ()
    {
        AnalyzeSound ();
        if (mouth == null)
            return;
        float freq = HumanFreq (200, 800) * 1000;
        if (freq > 2 && freq < 50) {
            // Ease the mouth open over `time` seconds.
            float step = Mathf.SmoothStep (0, 1, Mathf.SmoothStep (0, 1, elapsedTime / time));
            mouth.localPosition = Vector3.Lerp (initialPos, endPos, step);
            elapsedTime += Time.deltaTime;
        } else {
            // Out of range: close the mouth and restart the opening timer.
            mouth.localPosition = initialPos;
            time = OPENTIME;
            elapsedTime = 0;
        }
    }

    /// Analyzes the sound, to get volume and pitch values.
    private void AnalyzeSound ()
    {
        // The buffers only exist after Init; skip until then so a
        // pre-existing AudioSource is never handed null arrays.
        if (audio == null || samples == null || spectrum == null)
            return;
        // Get all of our samples from the mic.
        audio.GetOutputData (samples, 0);
        // Sums squared samples (plain multiplication; no need for Mathf.Pow)
        float sum = 0;
        for (int i = 0; i < SAMPLECOUNT; i++) {
            sum += samples [i] * samples [i];
        }
        // RMS is the square root of the average value of the samples.
        rmsValue = Mathf.Sqrt (sum / SAMPLECOUNT);
        dbValue = 20 * Mathf.Log10 (rmsValue / REFVALUE);
        // Clamp it to {clamp} min
        if (dbValue < -clamp) {
            dbValue = -clamp;
        }
        // Gets the sound spectrum.
        audio.GetSpectrumData (spectrum, 0, FFTWindow.BlackmanHarris);
        float maxV = 0;
        int maxN = 0;
        // Find the highest sample.
        for (int i = 0; i < SAMPLECOUNT; i++) {
            if (spectrum [i] > maxV && spectrum [i] > THRESHOLD) {
                maxV = spectrum [i];
                maxN = i; // maxN is the index of max
            }
        }
        // Pass the index to a float variable
        float freqN = maxN;
        // Interpolate index using neighbours
        if (maxN > 0 && maxN < SAMPLECOUNT - 1) {
            float dL = spectrum [maxN - 1] / spectrum [maxN];
            float dR = spectrum [maxN + 1] / spectrum [maxN];
            freqN += 0.5f * (dR * dR - dL * dL);
        }
        // Convert index to frequency (the spectrum spans 0..FREQUENCY/2)
        pitchValue = freqN * (FREQUENCY / 2) / SAMPLECOUNT;
    }

    /// Averages the spectrum magnitudes of the bins between fLow and fHigh (Hz).
    /// Returns 0 when the spectrum has not been allocated yet.
    private float HumanFreq (float fLow, float fHigh)
    {
        if (spectrum == null)
            return 0;
        // Keep both bin indices inside the spectrum array, and n2 >= n1 so the
        // divisor below is never zero or negative.
        int lastBin = spectrum.Length - 1;
        int n1 = Mathf.Clamp ((int)Mathf.Floor (fLow * SAMPLECOUNT * 2 / FREQUENCY), 0, lastBin);
        int n2 = Mathf.Clamp ((int)Mathf.Floor (fHigh * SAMPLECOUNT * 2 / FREQUENCY), n1, lastBin);
        float sum = 0;
        // average the volumes of frequencies fLow to fHigh
        for (int i = n1; i <= n2; i++) {
            sum += spectrum [i];
        }
        return sum / (n2 - n1 + 1);
    }
}
By the way, I give the most credit to @Riro, as I basically just rewrote his script in http://goo.gl/oLTp5
Great you found a solution, but you really don't want to use Mathf.Pow
for computing a square. Mathf.Pow is very slow, just do this:
// Square the sample by multiplying it with itself instead of calling Mathf.Pow.
sum += samples [i] * samples [i];
@hirenkacha , yes, no problem for me. And thanks @Bunny83, I changed that
@hannibalov I realize this is over 2 years old at this point, but I'm just starting Unity and C# and I'm trying to make a real-time avatar for an interactive Halloween display this year. What do I need to do to get this script to actually initialize and do something?
I've copied it as-is and applied it to an object in Unity, but when I run my game, it doesn't do anything at all. Any help and hand-holding would be much appreciated, and if it helps at all, my object is using blendShapes for its facial animation, so I'd need this script to drive the mouth-opening blendShape.
Thanks!
For anyone who is trying to use this code, you need to set audio.loop = true. Right now it's set to false in the Init() function.
If this isn't true, when you play the audio clip and it reaches the end of the time (in this code it's 999), it'll stop playing.
Answer by Bunny83 · Jun 27, 2012 at 12:07 PM
You take just the center sample, that doesn't make much sense ;) the sample rate is much faster than you're scanning the data. So you pick a single sample every hundreds of samples and use it.
Here's the refactored script the sample project is using:
// Mean absolute amplitude of the AudioSource output over one window of
// winWidth samples; the result is left in num3.
float[] array = new float[this.winWidth];
audioSource.GetOutputData(array, 0);
float num3 = 0f;
foreach (float sample in array)
{
    num3 += Mathf.Abs(sample);
}
num3 = num3 / (float)this.winWidth;
Note: The original script was written in UnityScript and additionally had a min and max calculation in the for loop, but the values aren't used anywhere, so I just copied the relevant part ;)
Sorry for the "bad" variable names, but local variable names don't exist in CIL. They are just values on the stack ;)
edit
btw. the winWidth defaults to 512 but it might be adjusted.
Ok, I changed the script to:
/// Returns the mean absolute amplitude of the most recent output samples of
/// the attached AudioSource, or 0 when there is no AudioSource.
private float GetVolume()
{
    if (audio == null)
        return 0;

    float[] data = new float[samples];
    audio.GetOutputData(data, 0);

    float sum = 0;
    foreach (float f in data)
    {
        sum += Mathf.Abs(f);
    }
    return sum / samples;
}
I'm printing the output each frame and samples = 512. The result is 0.1718733, 0.1718769, ... 0.17187xxx
no matter what I say to the microphone or even if I'm quiet. What's wrong?? For regular AudioClips it seems to work more or less ok, but for the microphone it's just a mess
I don't know how, but it seems i missed the microphone thing ;)
Unity doesn't provide realtime microphone data afaik...
http://answers.unity3d.com/questions/29916/audio-microphone-analysis.html
All you can do is record with the Microphone API
GetOutputData is a function of the AudioSource, not of the incoming microphone data. If you have an AudioClip you can look ahead as far as you wish, but realtime applications always lag behind. I don't think you can use GetOutputData in this case.
Do you even play the clip while recording?
Ok, it looks like sometimes Unity does not recognise the microphone properly (I guess depending on whether you plug it in and unplug it or so). I spent hours desperate with illogical results, and I just needed to restart it. I'll post my definitive solution, tried and working both for clips and microphone