Source code for maelzel.snd.freqestimate

"""
Frequency estimation of a signal with different algorithms

The most important entry point is :func:`~maelzel.snd.freqestimate.f0curve`, which estimates
the fundamental frequency of an audio signal together with its voicedness (the reliability
of the measurement)

"""
from __future__ import annotations
import numpy as np
import bpf4
from math import ceil
from emlib.mathlib import nextpowerof2
from maelzel.common import getLogger


def _nextPow2(x: int) -> int:
    return 1 if x == 0 else 2**(x-1).bit_length()



[docs]
def parabolic(f: np.ndarray, x: int) -> tuple[float, float]:
    """
    Estimates inter-sample maximum

    Quadratic interpolation for estimating the true position of an
    inter-sample maximum when nearby samples are known.

    Args:
        f: a vector (an array of sampled values over a regular grid)
        x: index for that vector

    Returns:
        (vx, vy), the coordinates of the vertex of a parabola that goes
        through point x and its two neighbors.

    Example
    =======

    Defining a vector f with a local maximum at index 3 (= 6), find local
    maximum if points 2, 3, and 4 actually defined a parabola.

    >>> f = [2, 3, 1, 6, 4, 2, 3, 1]

    >>> parabolic(f, 3)
    Out[4]: (3.2142857142857144, 6.1607142857142856)

    """
    xv = 1/2. * (f[x-1] - f[x+1]) / (f[x-1] - 2 * f[x] + f[x+1]) + x
    yv = f[x] - 1/4. * (f[x-1] - f[x+1]) * (xv - x)
    return float(xv), float(yv)



def _find(condition) -> np.ndarray:
    """Return the indices where ravel(condition) is true"""
    res, = np.nonzero(np.ravel(condition))
    return res



[docs]
def f0ZeroCross(sig: np.ndarray, sr: int) -> tuple[float, float]:
    """Estimate frequency by counting zero crossings

    Args:
        sig: a sampled signal
        sr : sample rate

    Returns:
        the frequency of the signal
    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    # Find all indices right before a rising-edge zero crossing
    indices = _find((sig[1:] >= 0) & (sig[:-1] < 0))

    # Naive (Measures 1000.185 Hz for 1000 Hz, for instance)
    # crossings = indices

    # More accurate, using linear interpolation to find intersample
    # zero-crossings (Measures 1000.000129 Hz for 1000 Hz, for instance)
    crossings = [i - sig[i] / (sig[i+1] - sig[i]) for i in indices]

    # Some other interpolation based on neighboring points might be better.
    # Spline, cubic, whatever
    return sr / float(np.mean(np.diff(crossings))), 1




[docs]
def f0FFT(sig: np.ndarray, sr: int) -> tuple[float, float]:
    """Estimate frequency from peak of FFT

    Args:
        sig: a sampled signal (mono)
        sr : sample rate

    Returns:
        the frequency of the signal, the probability that the sound
        frequency is valid
    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")
    # Compute Fourier transform of windowed signal
    import scipy.signal

    windowed = sig * scipy.signal.windows.blackmanharris(len(sig))
    f = np.fft.rfft(windowed)

    # Find the peak and interpolate to get a more accurate peak
    i = int(np.argmax(abs(f)))    # Just use this for less-accurate, naive version
    true_i = parabolic(np.log(abs(f)), i)[0]

    # Convert to equivalent frequency
    return sr * true_i / len(windowed), 1




[docs]
def f0Autocorr(sig: np.ndarray, sr: int) -> tuple[float, float]:
    """
    Estimate frequency using autocorrelation

    Args:
        sig: a sampled signal
        sr : sample rate

    Returns:
        the frequency of the signal, the probability that the sound
        frequency is valid
    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    # Calculate autocorrelation (same thing as convolution, but with
    # one input reversed in time), and throw away the negative lags

    import scipy.signal
    corr = scipy.signal.fftconvolve(sig, sig[::-1], mode='full')
    corr = corr[int(len(corr)/2):]

    # Find the first low point
    d = np.diff(corr)
    start = _find(d > 0)[0]

    # Find the next peak after the low point (other than 0 lag).  This bit is
    # not reliable for long signals, due to the desired peak occurring between
    # samples, and other peaks appearing higher.
    # Should use a weighting function to de-emphasize the peaks at longer lags.
    peak = np.argmax(corr[start:]) + start
    px, py = parabolic(corr, peak)
    return sr / px, 1




[docs]
def f0HPS(sig: np.ndarray, sr: int, maxharms=5) -> tuple[float, float]:
    """
    Estimate frequency using harmonic product spectrum (HPS)

    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    import scipy.signal
    windowed = sig * scipy.signal.windows.blackmanharris(len(sig))
    c = abs(np.fft.rfft(windowed))
    freq = 0
    for x in range(2, maxharms):
        a = c[::x]  # Should average or maximum instead of decimating
        # a = max(c[::x],c[1::x],c[2::x])
        c = c[:len(a)]
        i = int(np.argmax(abs(c)))
        try:
            true_i = parabolic(abs(c), i)[0]
        except IndexError:
            return freq, 0
        freq = sr * true_i / len(windowed)
        c *= a
    return freq, 1




[docs]
def f0curvePyin(sig: np.ndarray, sr: int, minfreq=50, maxfreq=5000,
                framelength=2048, winlength=None, hoplength=512
                ) -> tuple[bpf4.Linear, bpf4.Linear]:
    """
    Calculate the fundamental based on the pyin method

    This routine is implemented in pure python and is VERY slow

    Args:
        sig: the array representing the soud
        sr: sample rate
        minfreq: lowest frequency to considere for f0
        maxfreq: highest frequecy to considere for f0
        framelength: the fft size
        winlength: the window length (must be smaller than framelength)
        hoplength: interval between measurements (in samples)

    Returns:
        a tuple (freq curve, voiced probability curve)

    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    totaldur = len(sig) / sr

    from maelzel.snd import rosita
    f0, voiceg_flag, voiced_probs = rosita.pyin(sig, sr=sr,
                                                fmin=minfreq, fmax=maxfreq,
                                                frame_length=framelength,
                                                win_length=winlength,
                                                hop_length=hoplength)
    times = np.linspace(0, totaldur, len(f0))
    return bpf4.Linear(times, f0), bpf4.Linear(times, voiced_probs)




[docs]
def f0curvePyinVamp(sig: np.ndarray,
                    sr: int,
                    fftsize=2048,
                    overlap=4,
                    lowAmpSuppression=0.01,
                    onsetSensitivity=0.7,
                    pruneThreshold=0.1,
                    threshDistr='beta15',
                    unvoicedFreqs='nan'
                    ) -> tuple[bpf4.Linear, bpf4.Linear]:
    """
    Calculate the fundamental using the pyin vamp plugin

    Args:
        sig: the signal as numpy array
        sr: the sr
        fftsize: with sizes lower than 2048 the result might be unstable
        overlap: hop size as fftsize//overlap
        lowAmpSuppression: supress low amplitude pitch estimates, 0.01=-40dB, 0.001=-60dB
        onsetSensitivity: onset sensitivity
        pruneThreshold: totalDuration pruning threshold
        threshDistr: yin threshold distribution (see table below) - One of
            uniform, beta10, beta15, beta30, single10, single20-
        unvoicedFreqs: one of 'nan', 'negative'. If 'nan' unvoiced  frequencies
            (frequencies for segments where the f0 confidence is too low) are given
            as `nan`. If 'negative' unvoiced freqs are given as negative.

    Returns:
        a tuple (f0 bpf, probability bpf), where f0 is a bpf with the
        detected fundamental. Whenver the algorithms detects unvoiced
        (noise) or absence of a fundamental, the result is negative.

    ============   ============
    thresh_distr   Description
    ============   ============
    uniform        Uniform
    beta10         Beta (mean 0.10)
    beta15         Beta (mean 0.15)
    beta30         Beta (mean 0.30)
    single10       Single value 0.10
    single15       Single value 0.15
    single20       Single value 0.20
    ============   ============

    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    if fftsize < 2048:
        getLogger(__file__).warning(f"The fft size ({fftsize}) is too small for f0 tracking, it needs to be"
                                    f" at least 2048. Using 2048 instead")
        fftsize = 2048

    from maelzel.snd import vamptools
    data = vamptools.pyinPitchTrack(sig, sr=sr, fftSize=fftsize, overlap=overlap,
                                    lowAmpSuppression=lowAmpSuppression,
                                    onsetSensitivity=onsetSensitivity,
                                    pruneThresh=pruneThreshold,
                                    threshDistr=threshDistr,
                                    outputUnvoiced=unvoicedFreqs)
    times = np.ascontiguousarray(data[:, 0])
    f0 = np.ascontiguousarray(data[:, 1])
    probs = np.ascontiguousarray(data[:, 2])
    return bpf4.Linear(times, f0), bpf4.Linear(times, probs)



def _sigNumChannels(sig: np.ndarray):
    return sig.shape[1] if len(sig.shape) > 1 else 1



[docs]
def frequencyToWindowSize(freq: int, sr: int, powerof2=False, factor=2.0) -> int:
    """
    Return the size of a window in samples which can fit the given frequency

    Args:
        freq: the lowest frequency to fit in the window
        sr: the sampling rate
        powerof2: if True, force the size to the next power of two
        factor: the factor used to fit to account for phase shifts. Without this
            the window will not fit an entire cycle of the given frequency unless
            the signal is in perfect sync with the window.

    Returns:
        the size of the window in samples
    """
    winsize = int(ceil(1/freq  * sr * factor))
    if powerof2:
        winsize = nextpowerof2(winsize)
    return winsize




[docs]
def f0curve(sig: np.ndarray, sr: int, minfreq=60, overlap=4,
            method='pyin', unvoicedFreqs='nan'
            ) -> tuple[bpf4.Linear, bpf4.Linear]:
    """
    Estimate the fundamental and its voicedness

    The voicedness curve indicates how accurate the frequency reading is. The
    value should drop whenever the signal is either too low (during silences)
    or it turns noisy (during consonants in the case of speech)

    This function combines all frequency estimation algorithms into one interface.
    For more detailed access to parameters of each method use the specific
    function (pyin vamp: :func:`f0curvePyinVamp`; pyin native: :func:`f0curvePyin`, etc)

    Both the f0 curve and the voicedness curve are returned as bpfs.
    See https://bpf4.readthedocs.io for more information about bpfs

    Args:
        sig: the signal, a 1D float numpy array with samples between -1:1
        sr: the sr
        minfreq: the min. frequency of the fundamental. Using this hint
            we estimate the best value for fft size / window size
        overlap: the amount of overlap between analysis windows
        method: the method used. One of 'pyin', 'fft', 'hps', 'autocorrelation'.
            For pyin make sure that you are using the vamp plugin, since the
            fallback version (using an implementation based on librosa's version
            of the algorithm) is very slow at the moment. **If the vamp plugin
            is not available you should see a warning**
        unvoicedFreqs: (only relevant for method 'pyin'). One of 'nan', 'negative'.
            If 'nan' unvoiced  frequencies (frequencies for segments where the
            f0 confidence is too low) are given as `nan`. If 'negative' unvoiced
            freqs are given as negative.


    Returns:
        a tuple (f0curve, f0voicedness)

    Example
    ~~~~~~~

    .. code-block:: python

        from maelzel.snd.audiosample import Sample
        from maelzel.snd import freqestimate
        samp = Sample("snd/finneganswake-fragm01.flac").getChannel(0, contiguous=True)
        freq, voiced = freqestimate.f0curve(sig=samp.samples, sr=samp.sr, method='pyin')
        freq.plot(figsize=(18, 6))

    .. image:: ../assets/snd-freqestimate-f0curve.png

    For a more in-depth example see ``notebooks/analysis-pyin.ipynb``
    """
    if _sigNumChannels(sig) > 1:
        raise ValueError("sig should be a mono signal")

    from maelzel.snd import vamptools
    if method == 'pyin':
        if vamptools.pyinAvailable():
            method = 'pyin-vamp'
        else:
            method = 'pyin-native'
            getLogger(__file__).warning("The pyin vamp plugin was not found. Falling back to"
                                        "the python version, this might be very slow")


    if method == 'pyin-native':
        winlength = frequencyToWindowSize(minfreq, sr=sr, powerof2=False)
        hoplength = winlength // overlap
        fftlength = nextpowerof2(winlength)
        return f0curvePyin(sig, sr, minfreq=minfreq,
                           framelength=fftlength,
                           winlength=winlength, hoplength=hoplength)
    elif method == 'pyin-vamp':
        fftsize = frequencyToWindowSize(minfreq, sr=sr, powerof2=True)
        return f0curvePyinVamp(sig, sr, fftsize=fftsize, overlap=overlap,
                               unvoicedFreqs=unvoicedFreqs)

    winsize = frequencyToWindowSize(minfreq, sr, powerof2=True)
    stepsize = winsize // overlap
    maxidx = len(sig) - winsize
    maxn = int(maxidx / stepsize)
    func = {
        'autocorrelation': f0Autocorr,
        'fft': f0FFT,
        'hps': f0HPS
    }[method]
    freqs, times, probs = [], [], []
    for n in range(maxn):
        idx = n * stepsize
        arr = sig[idx: idx+winsize]
        freq, prob = func(arr, sr)
        freqs.append(freq)
        times.append(idx/sr)
        probs.append(prob)
    return bpf4.Linear(times, freqs), bpf4.Linear(times, probs)




[docs]
def detectMinFrequency(samples: np.ndarray, sr: int, freqThreshold=30, overlap=4,
                       lowAmpSuppression=0.1,
                       refine=True
                       ) -> tuple[float, float]:
    """
    Detect the min. frequency in this audio sample

    Args:
        samples: the samples to analyze
        sr: the sample rate
        freqThreshold: the lowest freq. to considere
        overlap: the amount of overlap analysis per window
        lowAmpSuppression: how much to supress low amplitudes
        refine: if True, refine the freq and time data. This allows to
            performe a more coarse pass with a low overlap (for example, 4)
            and then a short pass with a higher overlap to obtain a more
            accurate value

    Returns:
        a tuple (min. freq, corresponding time) or (0., 0.) if no pitched sound was detected

    """
    from maelzel.snd import vamptools
    fftsize = frequencyToWindowSize(freqThreshold, sr=sr)
    f0data = vamptools.pyinPitchTrack(samples, sr=sr, fftSize=fftsize, overlap=overlap,
                                      lowAmpSuppression=lowAmpSuppression,
                                      outputUnvoiced='nan')
    freqs = f0data[:,1]
    mask = freqs > freqThreshold
    selected = freqs[mask]
    if len(selected) == 0:
        return 0., 0.
    idx = selected.argmin()
    time = float(f0data[:,0][mask][idx])
    if refine:
        margin = 0.2
        start = int((time - margin)*sr)
        end = int((time + margin)*sr)
        f, t = detectMinFrequency(samples[start:end], sr=sr, freqThreshold=freqThreshold,
                                  overlap=overlap*4, lowAmpSuppression=lowAmpSuppression,
                                  refine=False)
        return f, time - margin + t
    return float(selected[idx]), time