# Dependencies
import numpy as np
import essentia
import essentia.standard as ess
import matplotlib.pyplot as pyp
import IPython.display as ipd
import matplotlib
def genAudioTone(pitch, hopSize = 128, fs = 44100):
    """
    Generates pure sine tone audio data from the pitch contour

    Parameters
    ----------
    pitch : list
        pitch contour data as a list, one frequency value in Hz per frame
    hopSize : int
        number of samples to synthesize per pitch frame
    fs : int
        sampling rate in Hz

    Returns
    -------
    x : np.array
        audio data to play or write wav files
    """
    x = np.array([])
    phase = 0.0
    for frame_no in range(len(pitch)):
        # Synthesize hopSize samples at this frame's frequency,
        # continuing from the accumulated phase
        tone = np.cos((2*np.pi*pitch[frame_no]*np.arange(hopSize)/fs) + phase)
        x = np.append(x, tone)
        # Advance the phase by a full hop so consecutive frames join smoothly
        phase = phase + 2*np.pi*pitch[frame_no]*hopSize/fs
    return x
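A quick way to sanity-check the synthesizer is to feed it a made-up constant contour (the values below are purely illustrative):
# 345 frames of 128 samples each ≈ 1 s of a 220 Hz tone at 44.1 kHz
test_pitch = [220.0] * 345
ipd.Audio(genAudioTone(test_pitch, hopSize=128, fs=44100), rate=44100)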
# Load audio
Fs = 44100
audio1 = ess.MonoLoader(sampleRate = Fs,filename = 'audio1.wav')()
# Playing the loaded audio
ipd.Audio(audio1, rate=Fs)
# Plotting the Audio and its spectrogram
pyp.figure()
pyp.title('Time Domain Waveform of Audio')
pyp.xlabel('time (s)')
pyp.ylabel('amplitude')
t = np.arange(len(audio1))*1.0/Fs
pyp.plot(t,audio1);
W = 1024
H = 512
w = ess.Windowing(type = 'hann')
fft = ess.FFT()
c2p = ess.CartesianToPolar()
pool = essentia.Pool()
acf = ess.AutoCorrelation()
for frame in ess.FrameGenerator(audio1, frameSize = W, hopSize = H):
    mag, phase = c2p(fft(w(frame)))   # magnitude and phase spectra
    autocorr = acf(w(frame))          # per-frame autocorrelation
    pool.add('mag_spec', mag)
    pool.add('phase_spec', phase)
    pool.add('acf', autocorr)
freq = np.arange(W//2 + 1)*(Fs/W)
t = np.arange(len(pool['mag_spec']))*(len(audio1)/Fs)/(len(pool['mag_spec']))
pyp.imshow(pool['mag_spec'][:,0:64].T, origin = 'lower', aspect='auto', extent=[t[0], t[-1], freq[0], freq[64]])
pyp.colorbar()
pyp.title('Spectrogram')
pyp.xlabel('time (s)')
pyp.ylabel('frequency (Hz)');
The autocorrelation function of the signal over a window of size $W$ starting at sample $t$ is
\begin{align}
r_{t}(\tau) = \sum_{j = t+1}^{t+W} x[j]\,x[j+\tau]
\end{align}
YIN instead works with the difference function, which compares the window with a copy of itself shifted by the candidate lag $\tau$:
\begin{align}
d_{t}(\tau) = \sum_{j = 1}^{W} (x[j] - x[j+\tau])^{2}
\end{align}
Expanding the square relates the two: $d_{t}(\tau) = r_{t}(0) + r_{t+\tau}(0) - 2r_{t}(\tau)$, which is the form used for the computation below.
# YIN Algorithm
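As a quick numerical sanity check (not in the original notebook), the expansion can be verified on a synthetic 220 Hz sine:
# Verify d_t(tau) = r_t(0) + r_{t+tau}(0) - 2*r_t(tau) for t = 0
x_test = np.sin(2*np.pi*220*np.arange(2048)/44100)
Wt, max_lag = 1024, 200
# Direct definition of the difference function
d_direct = np.array([np.sum((x_test[:Wt] - x_test[tau:tau + Wt])**2)
                     for tau in range(max_lag)])
# Windowed autocorrelation over Wt samples starting at t
def r(t, tau):
    return np.sum(x_test[t:t + Wt] * x_test[t + tau:t + tau + Wt])
d_expanded = np.array([r(0, 0) + r(tau, 0) - 2*r(0, tau)
                       for tau in range(max_lag)])
assert np.allclose(d_direct, d_expanded)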
# Calculation of the MDF for one frame, via the expansion
# d_t(tau) = r_t(0) + r_{t+tau}(0) - 2*r_t(tau); the energy term
# r_{t+tau}(0) is approximated by the energy of the next frame.
ti = 100                       # frame index
r_t = pool['acf'][ti]          # autocorrelation of frame ti
r_next = pool['acf'][ti + 1]   # autocorrelation of the next frame
mdf = r_t[0] + r_next[0] - 2*r_t[:200]
pyp.plot(mdf)
pyp.axhline(0.0000035,color = 'r')
pyp.axhline(-0.0000035,color = 'r')
pyp.title('Mean Difference Function')
pyp.xlabel('lag (samples)')
pyp.ylabel('value');
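Essentia's PitchYin, used next, runs the remaining steps internally. As a rough illustration (not part of the original notebook, and with an assumed absolute threshold of 0.1), YIN normalizes the MDF into the cumulative mean normalized difference function (CMNDF) and picks the first lag that dips below the threshold:
def yin_pick(d, fs, threshold=0.1):
    # d'(0) = 1; d'(tau) = d(tau) * tau / sum_{j=1..tau} d(j)
    dprime = np.ones_like(d)
    cumsum = np.cumsum(d[1:])
    dprime[1:] = d[1:] * np.arange(1, len(d)) / np.where(cumsum != 0, cumsum, 1e-12)
    below = np.where(dprime[1:] < threshold)[0]
    if len(below) == 0:
        return 0.0             # no dip below the threshold: treat as unvoiced
    tau = below[0] + 1         # first qualifying lag, in samples
    return fs / tau            # period in samples -> f0 in Hz

yin_pick(mdf, Fs)   # rough f0 estimate for the frame plotted above
PitchYin additionally refines the chosen lag with parabolic interpolation (its interpolate parameter), which this sketch omits.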
# YIN from essentia
yin = ess.PitchYin(frameSize = W,interpolate = True,tolerance = 1)
pyt = []
ct = []
for frame in ess.FrameGenerator(audio1, frameSize = W, hopSize = H):
    py, c = yin(frame)    # pitch estimate and its confidence
    pyt.append(py)
    ct.append(c)
pyt = np.asarray(pyt)
ct = np.asarray(ct)
pyp.imshow(pool['mag_spec'][:,0:64].T,origin = 'lower',aspect='auto')
pyp.colorbar()
pyp.plot(pyt*(ct>0.5)*(W/Fs),color = 'r')   # Hz -> FFT bin index; gated by confidence > 0.5
pyp.title('Pitch on top of Spectrogram');
# Audio corresponding to extracted pitch
ipd.Audio(genAudioTone(pyt*(ct>0.5),H,Fs), rate=Fs)
audio_ex = ess.MonoLoader(sampleRate = Fs,filename = 'small_lyrics.wav')()
# Playing the loaded audio
ipd.Audio(audio_ex, rate=Fs)
W = 1024
H = 512
w = ess.Windowing(type = 'hann')
fft = ess.FFT()
c2p = ess.CartesianToPolar()
pool1 = essentia.Pool()
acf = ess.AutoCorrelation()
for frame in ess.FrameGenerator(audio_ex, frameSize = W, hopSize = H):
    mag, phase = c2p(fft(w(frame)))
    autocorr = acf(w(frame))
    pool1.add('mag_spec', mag)
    pool1.add('phase_spec', phase)
    pool1.add('acf', autocorr)
freq = np.arange(W//2 + 1)*(Fs/W)
t = np.arange(len(pool1['mag_spec']))*(len(audio_ex)/Fs)/(len(pool1['mag_spec']))
pyp.imshow(pool1['mag_spec'][:,0:64].T, origin = 'lower', aspect='auto', extent=[t[0], t[-1], freq[0], freq[64]])
pyp.colorbar()
pyp.title('Spectrogram')
pyp.xlabel('time (s)')
pyp.ylabel('frequency (Hz)');
# YIN from essentia
yin = ess.PitchYin(frameSize = W,interpolate = True,tolerance = 1)
pyt = []
ct = []
for frame in ess.FrameGenerator(audio_ex, frameSize = W, hopSize = H):
    py, c = yin(frame)
    pyt.append(py)
    ct.append(c)
pyt = np.asarray(pyt)
ct = np.asarray(ct)
pyp.plot(pyt*(ct>0.2)*(W/Fs),color = 'r');
ipd.Audio(genAudioTone(pyt*(ct>0.2),H,Fs), rate=Fs)
As we can see in the above example, the YIN algorithm fails to extract the pitch correctly: the pitch of the sung excerpt varies too much for a single fixed threshold on the MDF to be reliable.
This is one of the major motivations for moving on to the pYIN algorithm. pYIN follows the same steps as YIN up to the extraction of the MDF; after that stage, however, the procedure changes: instead of committing to one threshold, it considers many thresholds and weights them with a prior.
# pYIN
# Considering same MDF from before -
from scipy.stats import beta
# Beta-distributed priors over the YIN threshold (means 0.10, 0.15, 0.20)
rv1 = beta(2, 18)
rv2 = beta(2, 11.33)
rv3 = beta(2, 8)
x = np.linspace(0, 1, 100)
pyp.figure()
pyp.plot(x,rv1.pdf(x),label = 'mean = 0.10')
pyp.plot(x,rv2.pdf(x),label = 'mean = 0.15')
pyp.plot(x,rv3.pdf(x),label = 'mean = 0.2')
pyp.legend(loc = 'best')
pyp.title('Beta Distribution Prior')
pyp.xlabel('Threshold value')
pyp.ylabel('Density value');
Each candidate lag $\tau_0$ then accumulates the prior probability of every threshold $s_i$ that selects it:
\begin{align}
P(\tau = \tau_0) = \sum_{i} P(\tau = \tau_0 \mid s_i)\,P(s_i)
\end{align}
Now, this is where the authors use the concept of a Markov model to capture the underlying dynamics of the pitch evolution. They discretize the pitch space and use the probabilities obtained above as the emission probabilities of a Hidden Markov Model. The task thus boils down to finding the most likely pitch sequence given the emissions, which is solved with the Viterbi algorithm for maximum likelihood sequence estimation. A sketch of the candidate-weighting step follows.
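A minimal sketch of the candidate weighting, under simplifying assumptions (a uniform grid of 100 thresholds, a Beta(2, 18) prior, and each threshold deterministically selecting only the first dip of the CMNDF, whereas the published pYIN considers every minimum below the threshold):
from collections import defaultdict

def pyin_candidates(dprime, n_thresholds=100, a=2, b=18):
    # dprime: the CMNDF of one frame, as in yin_pick above
    thresholds = np.linspace(0.01, 1.0, n_thresholds)
    prior = beta(a, b).pdf(thresholds)
    prior /= prior.sum()                  # discretized P(s_i)
    cand = defaultdict(float)
    for s_i, p_s in zip(thresholds, prior):
        below = np.where(dprime[1:] < s_i)[0]
        if len(below):
            tau0 = below[0] + 1           # first dip below threshold s_i
            cand[tau0] += p_s             # P(tau = tau0) += P(s_i)
    return dict(cand)                     # lag -> probability
Per frame these candidate probabilities become the HMM emissions, and Viterbi decoding recovers the smoothest pitch trajectory; Essentia's PitchYinProbabilistic wraps the whole pipeline, as used next.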
# PYin using essentia
pyin = ess.PitchYinProbabilistic(frameSize = W,hopSize = H,lowRMSThreshold = 0.0001,preciseTime = True)
pyin_p,vp = pyin(audio_ex)
pyp.figure()
pyp.plot(pyin_p[pyin_p>0],color = 'r')
pyp.title('Pitch Contour')
pyp.xlabel('voiced frame index')
pyp.ylabel('pitch (Hz)');
pyp.figure()
pyp.imshow(pool1['mag_spec'][:,0:64].T,origin = 'lower',aspect='auto')
pyp.colorbar()
pyp.plot(pyin_p[pyin_p>0]*(W/Fs),color = 'r');   # Hz -> FFT bin index; note dropping unvoiced frames shifts the alignment
# Pitch track generated by pYIN
ipd.Audio(genAudioTone(pyin_p[pyin_p>0],H,Fs), rate=Fs)