"""A few utils (specshow, melspectrogram) vendored from librosa.
This code was copied from parts of librosa and adapted so that the targeted
functionality can be used with fewer dependencies and less manual installation
(namely of libsndfile) than librosa itself requires.
Librosa can be found here: https://librosa.org/
Librosa's license follows.
ISC License
Copyright (c) 2013--2017, librosa development team.
Permission to use, copy, modify, and/or distribute this software for any purpose with or
without fee is hereby granted, provided that the above copyright notice and this
permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
"""
import warnings
import numpy as np
from matplotlib.cm import get_cmap
from matplotlib.axes import Axes
from matplotlib.ticker import Formatter, ScalarFormatter
from matplotlib.ticker import MaxNLocator
from matplotlib.ticker import SymmetricalLogLocator
import matplotlib
from packaging.version import parse as version_parse
import scipy
import scipy.signal
from numpy.lib.stride_tricks import as_strided
import re
# Byte budget used by ``stft`` to decide how many STFT columns are transformed
# per FFT call: 2**8 * 2**10 = 262144 bytes (256 KiB).
MAX_MEM_BLOCK = 2 ** 8 * 2 ** 10
# specshow
def specshow(
    data,
    x_coords=None,
    y_coords=None,
    x_axis=None,
    y_axis=None,
    sr=22050,
    hop_length=512,
    fmin=None,
    fmax=None,
    tuning=0.0,
    bins_per_octave=12,
    key='C:maj',
    Sa=None,
    mela=None,
    thaat=None,
    auto_aspect=True,
    htk=False,
    ax=None,
    **kwargs,
):
    """Draw a 2-D array (e.g. a spectrogram) with ``Axes.pcolormesh``.

    Parameters
    ----------
    data : np.ndarray
        Matrix to display; complex input is replaced by its magnitude
        (with a warning).
    x_coords, y_coords : array-like or None
        Explicit mesh coordinates. When ``None`` they are derived from
        ``x_axis`` / ``y_axis`` and the remaining parameters.
    x_axis, y_axis : str or None
        Axis type names understood by the coordinate builders.
    sr, hop_length, fmin, fmax, tuning, bins_per_octave, key, htk
        Forwarded to the coordinate builders.
    Sa, mela, thaat
        Forwarded (together with ``key``) to the axis decorator.
    auto_aspect : bool
        When true and both axes have the same type and limits, the aspect
        ratio is set to ``'equal'``.
    ax : matplotlib.axes.Axes or None
        Target axes; ``None`` means the current pyplot axes.
    **kwargs
        Passed to ``pcolormesh``. Defaults are supplied for ``cmap``,
        ``rasterized``, ``edgecolors`` and ``shading``.

    Returns
    -------
    The object returned by ``axes.pcolormesh``.
    """
    if np.issubdtype(data.dtype, np.complexfloating):
        warnings.warn(
            'Trying to display complex-valued input. Showing magnitude instead.'
        )
        data = np.abs(data)

    # Plotting defaults; anything the caller passed explicitly wins.
    plot_defaults = (
        ('cmap', cmap(data)),
        ('rasterized', True),
        ('edgecolors', 'None'),
        ('shading', 'flat'),
    )
    for option, value in plot_defaults:
        kwargs.setdefault(option, value)

    coord_params = {
        'kwargs': kwargs,
        'sr': sr,
        'fmin': fmin,
        'fmax': fmax,
        'tuning': tuning,
        'bins_per_octave': bins_per_octave,
        'hop_length': hop_length,
        'key': key,
        'htk': htk,
    }

    # Mesh coordinates: rows map to y, columns map to x.
    n_rows, n_cols = data.shape[0], data.shape[1]
    y_coords = __mesh_coords(y_axis, y_coords, n_rows, **coord_params)
    x_coords = __mesh_coords(x_axis, x_coords, n_cols, **coord_params)

    axes = __check_axes(ax)
    mesh = axes.pcolormesh(x_coords, y_coords, data, **kwargs)
    __set_current_image(ax, mesh)

    axes.set_xlim(x_coords.min(), x_coords.max())
    axes.set_ylim(y_coords.min(), y_coords.max())

    # Axis scaling (log / symlog where appropriate).
    for axis_type, which in ((x_axis, 'x'), (y_axis, 'y')):
        __scale_axes(axes, axis_type, which)

    # Tick formatters, locators and labels.
    notation = dict(key=key, Sa=Sa, mela=mela, thaat=thaat)
    __decorate_axis(axes.xaxis, x_axis, **notation)
    __decorate_axis(axes.yaxis, y_axis, **notation)

    # Square the plot when both axes describe the same quantity and range
    # (self-similarity / covariance style plots).
    is_square = __same_axes(x_axis, y_axis, axes.get_xlim(), axes.get_ylim())
    if is_square and auto_aspect:
        axes.set_aspect('equal')

    return mesh
def cmap(data, robust=True, cmap_seq='magma', cmap_bool='gray_r', cmap_div='coolwarm'):
    """Choose a colormap that fits the value range of ``data``.

    Boolean data gets a two-level ``cmap_bool``. Otherwise the finite values
    are summarised by their 2nd/98th percentiles (``robust=True``) or their
    min/max (``robust=False``): data lying on one side of zero gets the
    sequential map ``cmap_seq``, data straddling zero gets the diverging map
    ``cmap_div``.
    """
    values = np.atleast_1d(data)
    if values.dtype == 'bool':
        return get_cmap(cmap_bool, lut=2)

    finite_values = values[np.isfinite(values)]
    percentiles = [2, 98] if robust else [0, 100]
    low, high = np.percentile(finite_values, percentiles)

    one_signed = low >= 0 or high <= 0
    return get_cmap(cmap_seq if one_signed else cmap_div)
def __mesh_coords(ax_type, coords, n, **kwargs):
    """Return mesh coordinates for one axis.

    Caller-supplied ``coords`` are returned unchanged once they are known to
    hold at least ``n`` entries. Otherwise the builder registered for
    ``ax_type`` is called as ``builder(n, **kwargs)``.

    Raises ``Exception`` for too-short ``coords`` or an unknown ``ax_type``.
    """
    if coords is not None:
        if len(coords) < n:
            raise Exception(
                'Coordinate shape mismatch: {}<{}'.format(len(coords), n)
            )
        return coords

    # Each builder together with every axis-type name it serves.
    families = (
        (__coord_fft_hz, ('linear', 'fft', 'fft_note', 'fft_svara', 'hz', 'log')),
        (__coord_mel_hz, ('mel',)),
        (__coord_cqt_hz, ('cqt', 'cqt_hz', 'cqt_note', 'cqt_svara')),
        (__coord_chroma, ('chroma', 'chroma_c', 'chroma_h')),
        (__coord_time, ('time', 's', 'ms', 'lag', 'lag_s', 'lag_ms')),
        (__coord_n, ('tonnetz', 'off', 'frames', None)),
        (__coord_tempo, ('tempo',)),
        (__coord_fourier_tempo, ('fourier_tempo',)),
    )
    builders = {name: fn for fn, names in families for name in names}

    builder = builders.get(ax_type)
    if builder is None:
        raise Exception('Unknown axis type: {}'.format(ax_type))
    return builder(n, **kwargs)
def __coord_fourier_tempo(n, sr=22050, hop_length=512, **_kwargs):
    """Bin edges for a Fourier tempogram with ``n`` rows.

    The bin centre frequencies are shifted down by half a bin so that each
    cell is centred on its frequency, clipped at zero, and closed with the
    highest centre frequency as the final edge.
    """
    win_length = 2 * (n - 1)
    centers = fourier_tempo_frequencies(
        sr=sr, hop_length=hop_length, win_length=win_length
    )
    top_edge = centers[-1]
    half_bin = 0.5 * (centers[1] - centers[0])
    lower_edges = np.maximum(0, centers - half_bin)
    return np.append(lower_edges, [top_edge])
def fourier_tempo_frequencies(sr=22050, win_length=384, hop_length=512):
    """Frequencies of the Fourier tempogram bins.

    ``sr / hop_length`` is the frame rate in frames per second; the factor 60
    turns it into frames per minute, which is then treated as the "sampling
    rate" of an FFT of length ``win_length``.
    """
    frames_per_minute = sr * 60 / float(hop_length)
    return fft_frequencies(sr=frames_per_minute, n_fft=win_length)
def frames_to_time(frames, sr=22050, hop_length=512, n_fft=None):
    """Convert frame indices to times in seconds.

    Frames are first mapped to sample positions (see ``frames_to_samples``)
    and those positions are then divided by the sampling rate ``sr``.
    """
    return samples_to_time(
        frames_to_samples(frames, hop_length=hop_length, n_fft=n_fft), sr=sr
    )
def samples_to_time(samples, sr=22050):
    """Convert sample indices (scalar or array-like) to seconds at rate ``sr``."""
    sample_positions = np.asanyarray(samples)
    return sample_positions / float(sr)
def frames_to_samples(frames, hop_length=512, n_fft=None):
    """Convert frame indices to integer sample positions.

    Each frame index is multiplied by ``hop_length``; when ``n_fft`` is given,
    ``n_fft // 2`` is added as an offset.
    """
    half_window = 0 if n_fft is None else int(n_fft // 2)
    positions = np.asanyarray(frames) * hop_length + half_window
    return positions.astype(int)
def __coord_time(n, sr=22050, hop_length=512, **_kwargs):
    """Time (in seconds) of the ``n + 1`` frame boundaries ``0..n``."""
    frame_edges = np.arange(n + 1)
    return frames_to_time(frame_edges, sr=sr, hop_length=hop_length)
def __coord_chroma(n, bins_per_octave=12, **_kwargs):
    """Chroma bin edges: ``n + 1`` evenly spaced points from 0 to ``12 * n / bins_per_octave``."""
    upper = (12.0 * n) / bins_per_octave
    return np.linspace(0, upper, num=n + 1, endpoint=True)
def tempo_frequencies(n_bins, hop_length=512, sr=22050):
    """Tempo (beats per minute) associated with each lag bin.

    Parameters
    ----------
    n_bins : int
        Number of lag bins.
    hop_length : int
        Number of samples between successive frames.
    sr : number
        Sampling rate.

    Returns
    -------
    np.ndarray of float, shape ``(n_bins,)``
        Bin 0 (zero lag) is ``inf``; bin ``k >= 1`` is
        ``60 * sr / (hop_length * k)``.
    """
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype.
    bin_frequencies = np.zeros(int(n_bins), dtype=float)
    bin_frequencies[0] = np.inf
    bin_frequencies[1:] = 60.0 * sr / (hop_length * np.arange(1.0, n_bins))
    return bin_frequencies
def __coord_tempo(n, sr=22050, hop_length=512, **_kwargs):
    """Tempogram bin edges for ``n`` lag bins.

    Takes the tempo of lags ``1..n+1`` and scales lag ``k`` by
    ``(k + 0.5) / k``.
    """
    lag_tempi = tempo_frequencies(n + 2, sr=sr, hop_length=hop_length)[1:]
    lags = np.arange(1, n + 2)
    return lag_tempi * (lags + 0.5) / lags
def __coord_cqt_hz(n, fmin=None, bins_per_octave=12, sr=22050, **_kwargs):
    """Frequency edges (Hz) for ``n`` constant-Q bins.

    ``fmin`` defaults to the frequency of note ``C1`` and is corrected by the
    optional ``tuning`` keyword (in fractions of a bin). The returned
    ``n + 1`` edges start half a bin below ``fmin`` so that each CQT bin is
    vertically centred. A warning is issued when any edge exceeds ``sr / 2``.
    """
    if fmin is None:
        fmin = note_to_hz('C1')

    # Tuning correction, expressed in fractional bins.
    tuning = _kwargs.get('tuning', 0.0)
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Start half a bin low so bins end up centred on their frequencies.
    lowest_edge = fmin / 2.0 ** (0.5 / bins_per_octave)
    edges = cqt_frequencies(
        n + 1, fmin=lowest_edge, bins_per_octave=bins_per_octave
    )

    nyquist = 0.5 * sr
    if np.any(edges > nyquist):
        warnings.warn(
            'Frequency axis exceeds Nyquist. '
            'Did you remember to set all spectrogram parameters in specshow?'
        )
    return edges
def cqt_frequencies(n_bins, fmin, bins_per_octave=12, tuning=0.0):
    """Centre frequencies of ``n_bins`` constant-Q bins starting at ``fmin``.

    Bin ``k`` sits at ``fmin * 2 ** (k / bins_per_octave)``, further scaled
    by ``2 ** (tuning / bins_per_octave)``.
    """
    tuning_factor = 2.0 ** (float(tuning) / bins_per_octave)
    octave_offsets = np.arange(0, n_bins, dtype=float) / bins_per_octave
    ratios = 2.0 ** octave_offsets
    return tuning_factor * fmin * ratios
def note_to_hz(note, **kwargs):
    """Convert note name(s) to frequency in Hz via their MIDI numbers.

    Extra keyword arguments are forwarded to ``note_to_midi``.
    """
    midi_numbers = note_to_midi(note, **kwargs)
    return midi_to_hz(midi_numbers)
def midi_to_hz(notes):
    """Convert MIDI note number(s) to Hz, with MIDI 69 mapped to 440 Hz."""
    semitones_from_a4 = np.asanyarray(notes) - 69.0
    return 440.0 * (2.0 ** (semitones_from_a4 / 12.0))
def note_to_midi(note, round_midi=True):
    """Convert a spelled note (or an iterable of them) to MIDI number(s).

    A note is a letter ``A``-``G`` (either case), any number of accidentals,
    an optional signed octave (default 0) and an optional signed cents
    offset, e.g. ``'C#3'`` or ``'A4+25'``. Non-string input is treated as an
    iterable of notes and converted element-wise into an array.

    With ``round_midi`` true the result is rounded to an ``int``.

    Raises ``Exception`` when the string does not match the note format.
    """
    if not isinstance(note, str):
        return np.array([note_to_midi(n, round_midi=round_midi) for n in note])

    semitone_of = {'C': 0, 'D': 2, 'E': 4, 'F': 5, 'G': 7, 'A': 9, 'B': 11}
    accidental_shift = {
        '#': 1,
        '': 0,
        'b': -1,
        '!': -1,
        '♯': 1,
        '𝄪': 2,
        '♭': -1,
        '𝄫': -2,
        '♮': 0,
    }

    pattern = (
        r'^(?P<note>[A-Ga-g])'
        r'(?P<accidental>[#♯𝄪b!♭𝄫♮]*)'
        r'(?P<octave>[+-]?\d+)?'
        r'(?P<cents>[+-]\d+)?$'
    )
    parsed = re.match(pattern, note)
    if not parsed:
        raise Exception('Improper note format: {:s}'.format(note))

    letter = parsed.group('note').upper()
    # Accidentals stack: every symbol contributes its own shift.
    shift = np.sum([accidental_shift[symbol] for symbol in parsed.group('accidental')])

    octave_text = parsed.group('octave')
    cents_text = parsed.group('cents')
    octave = int(octave_text) if octave_text else 0
    cents = int(cents_text) * 1e-2 if cents_text else 0

    midi_value = 12 * (octave + 1) + semitone_of[letter] + shift + cents
    if round_midi:
        midi_value = int(np.round(midi_value))
    return midi_value
def __coord_n(n, **_kwargs):
    """Bare integer positions ``0..n`` (``n + 1`` edges for ``n`` cells)."""
    n_edges = n + 1
    return np.arange(n_edges)
def __coord_mel_hz(n, fmin=0, fmax=None, sr=22050, htk=False, **_kwargs):
    """Frequency edges (Hz) for ``n`` Mel bins.

    ``fmin`` falls back to 0 and ``fmax`` to ``sr / 2`` when ``None``. Every
    Mel centre frequency except the first is lowered by half the distance to
    its predecessor, the result is clipped at zero, and ``fmax`` is appended
    as the last edge.
    """
    fmin = 0 if fmin is None else fmin
    fmax = 0.5 * sr if fmax is None else fmax

    edges = mel_frequencies(n, fmin=fmin, fmax=fmax, htk=htk)
    half_gaps = 0.5 * np.diff(edges)
    edges[1:] -= half_gaps
    return np.append(np.maximum(0, edges), [fmax])
def __coord_fft_hz(n, sr=22050, **_kwargs):
    """Frequency edges (Hz) for ``n`` FFT bins.

    Bin centres are shifted down by half a bin so each cell is centred on its
    frequency, clipped to be non-negative, and closed with the highest centre
    frequency as the final edge.
    """
    n_fft = 2 * (n - 1)
    centers = fft_frequencies(sr=sr, n_fft=n_fft)
    top_edge = centers[-1]
    half_bin = 0.5 * (centers[1] - centers[0])
    lower_edges = np.maximum(0, centers - half_bin)
    return np.append(lower_edges, [top_edge])
def __check_axes(axes):
    """Return a usable ``Axes``: the given one, or pyplot's current axes for ``None``.

    Raises ``Exception`` when ``axes`` is neither ``None`` nor an ``Axes``.
    """
    if axes is None:
        # pyplot is imported only when it is actually needed.
        import matplotlib.pyplot as plt

        return plt.gca()

    if isinstance(axes, Axes):
        return axes

    raise Exception(
        '`axes` must be an instance of matplotlib.axes.Axes. '
        'Found type(axes)={}'.format(type(axes))
    )
def __set_current_image(ax, img):
    """Register ``img`` as pyplot's current image when in pyplot mode.

    A non-``None`` ``ax`` is taken to mean the caller uses the object API,
    in which case pyplot's current image is left alone.
    """
    if ax is not None:
        return

    import matplotlib.pyplot as plt

    plt.sci(img)
def __scale_axes(axes, ax_type, which):
    """Apply the scale (log / symlog) that matches ``ax_type`` to one axis.

    ``which`` selects the x axis when it equals ``'x'`` and the y axis
    otherwise. Axis types without a dedicated scale are left untouched.
    """
    # Before matplotlib 3.3 the scale keywords carried an axis suffix
    # (e.g. ``basex``); later versions use the bare names.
    legacy_keywords = version_parse(matplotlib.__version__) < version_parse('3.3.0')

    if which == 'x':
        suffix, scaler, limit = 'x', axes.set_xscale, axes.set_xlim
    else:
        suffix, scaler, limit = 'y', axes.set_yscale, axes.set_ylim
    if not legacy_keywords:
        suffix = ''

    thresh = 'linthresh' + suffix
    base = 'base' + suffix
    scale = 'linscale' + suffix

    if ax_type == 'mel':
        mode = 'symlog'
        kwargs = {thresh: 1000.0, base: 2}
    elif ax_type in ('cqt', 'cqt_hz', 'cqt_note', 'cqt_svara'):
        mode = 'log'
        kwargs = {base: 2}
    elif ax_type in ('log', 'fft_note', 'fft_svara'):
        mode = 'symlog'
        # librosa additionally sets the linear threshold to note_to_hz('C2')
        # here; this vendored copy deliberately leaves it at the default.
        kwargs = {base: 2, scale: 0.5}
    elif ax_type in ('tempo', 'fourier_tempo'):
        mode = 'log'
        kwargs = {base: 2}
        limit(16, 480)
    else:
        return

    scaler(mode, **kwargs)
def __decorate_axis(axis, ax_type, key='C:maj', Sa=None, mela=None, thaat=None):
    """Install tick formatter, locator and label for the given axis type.

    Only ``'time'``, ``'mel'`` and ``'log'`` are handled; any other type
    leaves ``axis`` untouched. ``key``, ``Sa``, ``mela`` and ``thaat`` are
    accepted but not used here.

    NOTE(review): ``TimeFormatter`` is not defined in the visible part of
    this module -- confirm it is provided elsewhere, otherwise the ``'time'``
    branch raises ``NameError``.
    """
    if ax_type == 'time':
        axis.set_major_formatter(TimeFormatter(unit=None, lag=False))
        axis.set_major_locator(MaxNLocator(prune=None, steps=[1, 1.5, 5, 6, 10]))
        axis.set_label_text('Time')
        return

    if ax_type in ('mel', 'log'):
        axis.set_major_formatter(ScalarFormatter())
        axis.set_major_locator(SymmetricalLogLocator(axis.get_transform()))
        axis.set_label_text('Hz')
def __same_axes(x_axis, y_axis, xlim, ylim):
    """Tell whether both axes share the same (non-``None``) type and limits.

    Used to decide whether a plot should be drawn square.
    """
    if x_axis is None or x_axis != y_axis:
        return False
    return xlim == ylim
# librosa.feature.melspectrogram
def melspectrogram(
    y=None,
    sr=22050,
    S=None,
    n_fft=2048,
    hop_length=512,
    win_length=None,
    window='hann',
    center=True,
    pad_mode='reflect',
    power=2.0,
    **kwargs,
):
    """Compute a Mel-scaled spectrogram.

    A spectrogram is taken from ``S`` when given, otherwise computed from the
    signal ``y`` as ``|stft(y)| ** power``. It is then projected onto a Mel
    filter bank built by ``mel(sr, n_fft, **kwargs)``.

    Returns the matrix product of the Mel basis with the spectrogram.
    """
    stft_params = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        power=power,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )
    spectrum, n_fft = _spectrogram(y=y, S=S, **stft_params)

    # Project the linear-frequency spectrum onto the Mel filter bank.
    filter_bank = mel(sr, n_fft, **kwargs)
    return np.dot(filter_bank, spectrum)
def _spectrogram(
    y=None,
    S=None,
    n_fft=2048,
    hop_length=512,
    power=1,
    win_length=None,
    window='hann',
    center=True,
    pad_mode='reflect',
):
    """Return ``(spectrogram, n_fft)`` from either ``S`` or the signal ``y``.

    When ``S`` is supplied it is returned as-is and ``n_fft`` is inferred from
    its row count as ``2 * (rows - 1)``; all other arguments are ignored.
    Otherwise the spectrogram is ``|stft(y)| ** power`` and ``n_fft`` is
    passed through.
    """
    if S is not None:
        # Pre-computed spectrogram: only the FFT size needs recovering.
        return S, 2 * (S.shape[0] - 1)

    transform = stft(
        y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=center,
        window=window,
        pad_mode=pad_mode,
    )
    return np.abs(transform) ** power, n_fft
def stft(
    y,
    n_fft=2048,
    hop_length=None,
    win_length=None,
    window='hann',
    center=True,
    dtype=None,
    pad_mode='reflect',
):
    """Short-time Fourier transform of a mono signal.

    Parameters
    ----------
    y : np.ndarray
        Input signal; must pass ``valid_audio`` (1-D, floating point, finite).
    n_fft : int
        Frame length after the window has been zero-padded.
    hop_length : int or None
        Samples between successive frames; defaults to ``win_length // 4``.
    win_length : int or None
        Window length; defaults to ``n_fft``.
    window
        Window specification accepted by ``get_window``.
    center : bool
        When true, ``y`` is padded by ``n_fft // 2`` on both sides (using
        ``pad_mode``) so that frames are centred.
    dtype : dtype or None
        Output dtype; derived from ``y.dtype`` via ``dtype_r2c`` when ``None``.
    pad_mode : str
        ``np.pad`` mode used when ``center`` is true.

    Returns
    -------
    np.ndarray, shape ``(1 + n_fft // 2, n_frames)``

    Raises
    ------
    Exception
        When ``center`` is false and ``n_fft`` exceeds the signal length
        (helpers may raise for invalid audio or windows as well).
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast against the frame matrix
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        if n_fft > y.shape[-1]:
            # The condition detects an FFT size larger than the signal, so
            # the message says "too large" (it used to claim "too small").
            warnings.warn(
                'n_fft={} is too large for input signal of length={}'.format(
                    n_fft, y.shape[-1]
                )
            )
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)
    elif n_fft > y.shape[-1]:
        raise Exception(
            'n_fft={} is too large for input signal of length={}'.format(
                n_fft, y.shape[-1]
            )
        )

    # Window the time series.
    y_frames = frame(y, frame_length=n_fft, hop_length=hop_length)

    if dtype is None:
        dtype = dtype_r2c(y.dtype)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), y_frames.shape[1]), dtype=dtype, order='F'
    )

    fft = get_fftlib()

    # How many columns fit within MAX_MEM_BLOCK? Always process at least one.
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform block by block to bound the size of temporaries.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * y_frames[:, bl_s:bl_t], axis=0
        )
    return stft_matrix
def get_window(window, Nx, fftbins=True):
    """Build a window of length ``Nx`` from a flexible specification.

    ``window`` may be a callable (called as ``window(Nx)``), a string, tuple
    or scalar (delegated to ``scipy.signal.get_window``), or an array/list
    that already has length ``Nx``.

    Raises ``Exception`` for a pre-computed window of the wrong length or an
    unsupported specification.
    """
    if callable(window):
        return window(Nx)

    if isinstance(window, (str, tuple)) or np.isscalar(window):
        return scipy.signal.get_window(window, Nx, fftbins=fftbins)

    if not isinstance(window, (np.ndarray, list)):
        raise Exception('Invalid window specification: {}'.format(window))

    if len(window) != Nx:
        raise Exception(
            'Window size mismatch: {:d} != {:d}'.format(len(window), Nx)
        )
    return np.asarray(window)
def pad_center(data, size, axis=-1, **kwargs):
    """Pad ``data`` along ``axis`` to length ``size``, keeping it centred.

    Extra keyword arguments go to ``np.pad``; ``mode`` defaults to
    ``'constant'``. When the padding is odd, the extra element goes after
    the data.

    Raises ``Exception`` when ``size`` is smaller than the current length.
    """
    kwargs.setdefault('mode', 'constant')

    current = data.shape[axis]
    before = int((size - current) // 2)
    if before < 0:
        raise Exception(
            'Target size ({:d}) must be at least input size ({:d})'.format(size, current)
        )
    after = int(size - current - before)

    widths = [(0, 0)] * data.ndim
    widths[axis] = (before, after)
    return np.pad(data, widths, **kwargs)
def valid_audio(y, mono=True):
    """Validate an audio buffer; return ``True`` or raise ``Exception``.

    The buffer must be a floating-point ``np.ndarray`` whose values are all
    finite. With ``mono`` true it must be 1-D; otherwise it must be 1-D or
    2-D, and a 2-D buffer needs at least two rows.
    """
    if not isinstance(y, np.ndarray):
        raise Exception('Audio data must be of type numpy.ndarray')

    if not np.issubdtype(y.dtype, np.floating):
        raise Exception('Audio data must be floating-point')

    n_dims = y.ndim
    if mono and n_dims != 1:
        raise Exception(
            'Invalid shape for monophonic audio: '
            'ndim={:d}, shape={}'.format(n_dims, y.shape)
        )
    if n_dims > 2 or n_dims == 0:
        raise Exception(
            'Audio data must have shape (samples,) or (channels, samples). '
            'Received shape={}'.format(y.shape)
        )
    if n_dims == 2 and y.shape[0] < 2:
        raise Exception(
            'Mono data must have shape (samples,). Received shape={}'.format(y.shape)
        )

    if not np.isfinite(y).all():
        raise Exception('Audio buffer is not finite everywhere')

    return True
def frame(x, frame_length, hop_length, axis=-1):
    """Slice ``x`` into overlapping frames using a strided view.

    With ``axis=-1`` the result has shape ``x.shape[:-1] + (frame_length,
    n_frames)``; with ``axis=0`` it has shape ``(n_frames, frame_length) +
    x.shape[1:]``, where ``n_frames = 1 + (len - frame_length) // hop_length``.
    Input that is not laid out contiguously for the chosen axis is copied
    first (with a warning).

    Raises ``Exception`` for non-array input, input shorter than one frame,
    ``hop_length < 1`` or an axis other than 0 / -1.
    """
    if not isinstance(x, np.ndarray):
        raise Exception(
            'Input must be of type numpy.ndarray, given type(x)={}'.format(type(x))
        )

    axis_length = x.shape[axis]
    if axis_length < frame_length:
        raise Exception(
            'Input is too short (n={:d}) for frame_length={:d}'.format(
                axis_length, frame_length
            )
        )

    if hop_length < 1:
        raise Exception('Invalid hop_length: {:d}'.format(hop_length))

    copy_warning = (
        'librosa.util.frame called with axis={} '
        'on a non-contiguous input. This will result in a copy.'
    )
    if axis == -1 and not x.flags['F_CONTIGUOUS']:
        warnings.warn(copy_warning.format(axis))
        x = np.asfortranarray(x)
    elif axis == 0 and not x.flags['C_CONTIGUOUS']:
        warnings.warn(copy_warning.format(axis))
        x = np.ascontiguousarray(x)

    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length

    byte_strides = np.asarray(x.strides)
    # Step between consecutive samples along the framed axis, in bytes.
    sample_step = np.prod(byte_strides[byte_strides > 0] // x.itemsize) * x.itemsize
    frame_step = hop_length * sample_step

    if axis == -1:
        out_shape = list(x.shape)[:-1] + [frame_length, n_frames]
        out_strides = list(byte_strides) + [frame_step]
    elif axis == 0:
        out_shape = [n_frames, frame_length] + list(x.shape)[1:]
        out_strides = [frame_step] + list(byte_strides)
    else:
        raise Exception('Frame axis={} must be either 0 or -1'.format(axis))

    return as_strided(x, shape=out_shape, strides=out_strides)
def dtype_r2c(d, default=np.complex64):
    """Map a real dtype to the complex dtype of matching precision.

    Parameters
    ----------
    d : dtype-like
        Input dtype. Complex dtypes are returned unchanged.
    default : dtype-like
        Result for real dtypes that have no explicit mapping.

    Returns
    -------
    np.dtype
        ``float32 -> complex64``, ``float64`` / builtin ``float`` ->
        ``complex128``; anything else maps to ``default``.
    """
    # ``np.float`` / ``np.complex`` were aliases of the builtins and were
    # removed in NumPy 1.24; the builtins resolve to the same dtypes.
    mapping = {
        np.dtype(np.float32): np.complex64,
        np.dtype(np.float64): np.complex128,
        np.dtype(float): complex,
    }

    # If we're given a complex type already, return it
    dt = np.dtype(d)
    if dt.kind == 'c':
        return dt

    # Otherwise, try to map the dtype; fall back to the default.
    return np.dtype(mapping.get(dt, default))
def get_fftlib():
    """Return the FFT backend currently stored in the module-level ``__FFTLIB``.

    The backend is installed by ``set_fftlib``.
    """
    return __FFTLIB
def set_fftlib(lib=None):
    """Install ``lib`` as the FFT backend returned by ``get_fftlib``.

    Passing ``None`` selects ``numpy.fft``.
    """
    global __FFTLIB
    if lib is None:
        from numpy import fft as lib
    __FFTLIB = lib
# Install the default FFT backend (numpy.fft) at import time so that
# get_fftlib() works without any explicit configuration.
set_fftlib(None)
def mel(
    sr,
    n_fft,
    n_mels=128,
    fmin=0.0,
    fmax=None,
    htk=False,
    norm='slaney',
    dtype=np.float32,
):
    """Build a Mel filter bank of shape ``(n_mels, 1 + n_fft // 2)``.

    Each row is a triangular filter over the FFT bin frequencies, defined by
    three consecutive points of a Mel-spaced grid between ``fmin`` and
    ``fmax`` (``sr / 2`` when ``None``). With ``norm='slaney'`` every filter
    is divided by half the width of its band; any other value is passed to
    ``normalize`` along the last axis. A warning is issued when a filter
    whose lower edge is non-zero ends up all-zero.
    """
    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Centre frequency of every FFT bin, and the Mel band edges
    # (n_mels filters need n_mels + 2 edge points).
    fft_bin_hz = fft_frequencies(sr=sr, n_fft=n_fft)
    band_edges = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    edge_widths = np.diff(band_edges)
    # ramps[j, k] = band_edges[j] - fft_bin_hz[k]
    ramps = np.subtract.outer(band_edges, fft_bin_hz)

    triangles = zip(ramps[:-2], ramps[2:], edge_widths[:-1], edge_widths[1:])
    for row, (from_low, from_high, rise_width, fall_width) in enumerate(triangles):
        rising = -from_low / rise_width
        falling = from_high / fall_width
        # Intersect both slopes with each other and with zero.
        weights[row] = np.maximum(0, np.minimum(rising, falling))

    if norm == 'slaney':
        # Slaney-style: roughly constant energy per channel.
        band_norm = 2.0 / (band_edges[2 : n_mels + 2] - band_edges[:n_mels])
        weights *= band_norm[:, np.newaxis]
    else:
        weights = normalize(weights, norm=norm, axis=-1)

    # A filter may legitimately be empty only when its lower edge is 0 Hz.
    has_response = weights.max(axis=1) > 0
    if not np.all((band_edges[:-2] == 0) | has_response):
        warnings.warn(
            'Empty filters detected in mel frequency basis. '
            'Some channels will produce empty responses. '
            'Try increasing your sampling rate (and fmax) or '
            'reducing n_mels.'
        )
    return weights
def fft_frequencies(sr=22050, n_fft=2048):
    """Centre frequencies of the ``1 + n_fft // 2`` FFT bins, from 0 to ``sr / 2``."""
    n_bins = int(1 + n_fft // 2)
    nyquist = float(sr) / 2
    return np.linspace(0, nyquist, n_bins, endpoint=True)
def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
    """Return ``n_mels`` frequencies (Hz) evenly spaced on the Mel scale.

    The grid runs from ``fmin`` to ``fmax`` inclusive; ``htk`` selects the
    Mel formula used by ``hz_to_mel`` / ``mel_to_hz``.
    """
    lowest_mel = hz_to_mel(fmin, htk=htk)
    highest_mel = hz_to_mel(fmax, htk=htk)
    mel_grid = np.linspace(lowest_mel, highest_mel, n_mels)
    return mel_to_hz(mel_grid, htk=htk)
def hz_to_mel(frequencies, htk=False):
    """Convert frequencies in Hz to Mels.

    With ``htk`` true the formula ``2595 * log10(1 + f / 700)`` is used.
    Otherwise the scale is linear (200/3 Hz per Mel) below 1000 Hz and
    logarithmic from 1000 Hz upwards.
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    linear_origin_hz = 0.0
    hz_per_mel = 200.0 / 3
    knee_hz = 1000.0  # where the log region begins (Hz)
    knee_mel = (knee_hz - linear_origin_hz) / hz_per_mel  # same point, in Mels
    log_step = np.log(6.4) / 27.0  # step size within the log region

    # Start with the linear mapping everywhere ...
    mels = (hz - linear_origin_hz) / hz_per_mel

    # ... then overwrite whatever falls into the log region.
    if hz.ndim:
        in_log_region = hz >= knee_hz
        mels[in_log_region] = knee_mel + np.log(hz[in_log_region] / knee_hz) / log_step
    elif hz >= knee_hz:
        mels = knee_mel + np.log(hz / knee_hz) / log_step

    return mels
def mel_to_hz(mels, htk=False):
    """Convert Mel values to frequencies in Hz (inverse of ``hz_to_mel``).

    With ``htk`` true the formula ``700 * (10 ** (m / 2595) - 1)`` is used.
    Otherwise the scale is linear (200/3 Hz per Mel) below the 1000 Hz knee
    and exponential above it.
    """
    mel_values = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0 ** (mel_values / 2595.0) - 1.0)

    linear_origin_hz = 0.0
    hz_per_mel = 200.0 / 3
    knee_hz = 1000.0  # where the log region begins (Hz)
    knee_mel = (knee_hz - linear_origin_hz) / hz_per_mel  # same point, in Mels
    log_step = np.log(6.4) / 27.0  # step size within the log region

    # Linear mapping first ...
    freqs = linear_origin_hz + hz_per_mel * mel_values

    # ... then the exponential part above the knee.
    if mel_values.ndim:
        in_log_region = mel_values >= knee_mel
        freqs[in_log_region] = knee_hz * np.exp(
            log_step * (mel_values[in_log_region] - knee_mel)
        )
    elif mel_values >= knee_mel:
        freqs = knee_hz * np.exp(log_step * (mel_values - knee_mel))

    return freqs
def normalize(S, norm=np.inf, axis=0, threshold=None, fill=None):
    """Normalize ``S`` along ``axis`` so that each slice has unit ``norm``.

    Parameters
    ----------
    S : np.ndarray
        Array to normalize; all values must be finite.
    norm : {np.inf, -np.inf, 0, number > 0, None}
        ``inf`` / ``-inf`` use the maximum / minimum magnitude, ``0`` the
        count of non-zero entries, a positive number ``p`` the l_p norm.
        ``None`` returns ``S`` unchanged.
    axis : int or None
        Axis along which norms are computed.
    threshold : number > 0 or None
        Slices whose norm is below this value count as "small"; defaults to
        ``tiny(S)``.
    fill : None or bool
        Handling of small slices: ``None`` leaves them un-normalized,
        ``False`` sets them to zero, ``True`` fills them with a constant so
        that they have unit norm.

    Returns
    -------
    np.ndarray with the shape and dtype of ``S``.

    Raises
    ------
    Exception
        For a non-positive ``threshold``, an invalid ``fill``, non-finite
        input, ``norm=0`` combined with ``fill=True``, or an unsupported
        ``norm``.
    """
    # Avoid div-by-zero
    if threshold is None:
        threshold = tiny(S)
    elif threshold <= 0:
        raise Exception('threshold={} must be strictly positive'.format(threshold))

    if fill not in [None, False, True]:
        raise Exception('fill={} must be None or boolean'.format(fill))

    if not np.all(np.isfinite(S)):
        raise Exception('Input must be finite')

    # All norms only depend on magnitude. The ``np.float`` alias was removed
    # in NumPy 1.24; builtin ``float`` is the same float64 dtype.
    mag = np.abs(S).astype(float)

    # For max/min norms, filling with 1 works
    fill_norm = 1

    if norm == np.inf:
        length = np.max(mag, axis=axis, keepdims=True)
    elif norm == -np.inf:
        length = np.min(mag, axis=axis, keepdims=True)
    elif norm == 0:
        if fill is True:
            raise Exception('Cannot normalize with norm=0 and fill=True')
        length = np.sum(mag > 0, axis=axis, keepdims=True, dtype=mag.dtype)
    elif np.issubdtype(type(norm), np.number) and norm > 0:
        length = np.sum(mag ** norm, axis=axis, keepdims=True) ** (1.0 / norm)
        if axis is None:
            fill_norm = mag.size ** (-1.0 / norm)
        else:
            fill_norm = mag.shape[axis] ** (-1.0 / norm)
    elif norm is None:
        return S
    else:
        raise Exception('Unsupported norm: {}'.format(repr(norm)))

    # Positions where the norm is below the threshold
    small_idx = length < threshold

    Snorm = np.empty_like(S)
    if fill is None:
        # Leave small slices un-normalized
        length[small_idx] = 1.0
        Snorm[:] = S / length
    elif fill:
        # Mark small slices via a nan-divide, then fill them. Since S is
        # finite, nan can only originate from the small positions.
        length[small_idx] = np.nan
        Snorm[:] = S / length
        Snorm[np.isnan(Snorm)] = fill_norm
    else:
        # Zero out small slices via an inf-divide (safe by IEEE-754 as long
        # as S is finite).
        length[small_idx] = np.inf
        Snorm[:] = S / length

    return Snorm
def tiny(x):
    """Smallest positive normal number for the dtype of ``x``.

    Floating and complex dtypes use their own precision; every other dtype
    falls back to ``float32``.
    """
    arr = np.asarray(x)
    is_inexact = np.issubdtype(arr.dtype, np.floating) or np.issubdtype(
        arr.dtype, np.complexfloating
    )
    reference_dtype = arr.dtype if is_inexact else np.float32
    return np.finfo(reference_dtype).tiny
# amplitude_to_db
def amplitude_to_db(S, ref=1.0, amin=1e-5, top_db=80.0):
    """Convert an amplitude spectrogram to decibels.

    The magnitude of ``S`` is squared and handed to ``power_to_db`` with
    ``ref`` and ``amin`` squared accordingly. ``ref`` may be a number or a
    callable that receives the magnitude and returns the reference value.
    Complex input triggers a warning because phase is discarded.
    """
    spectrum = np.asarray(S)
    if np.issubdtype(spectrum.dtype, np.complexfloating):
        warnings.warn(
            'amplitude_to_db was called on complex input so phase '
            'information will be discarded. To suppress this warning, '
            'call amplitude_to_db(np.abs(S)) instead.'
        )

    magnitude = np.abs(spectrum)

    # The reference is taken on amplitudes, i.e. before squaring.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    # Square in place; ``magnitude`` is not used as an amplitude afterwards.
    power = np.square(magnitude, out=magnitude)

    return power_to_db(power, ref=ref_value ** 2, amin=amin ** 2, top_db=top_db)
def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
    """Convert a power spectrogram to decibels.

    Computes ``10 * log10(max(amin, S)) - 10 * log10(max(amin, ref))``.
    ``ref`` may be a number or a callable applied to the magnitude. When
    ``top_db`` is not ``None`` the output is floored at ``max - top_db``.
    Complex input is replaced by its magnitude (with a warning).

    Raises ``Exception`` when ``amin <= 0`` or ``top_db < 0``.
    """
    spectrum = np.asarray(S)

    if amin <= 0:
        raise Exception('amin must be strictly positive')

    magnitude = spectrum
    if np.issubdtype(spectrum.dtype, np.complexfloating):
        warnings.warn(
            'power_to_db was called on complex input so phase '
            'information will be discarded. To suppress this warning, '
            'call power_to_db(np.abs(D)**2) instead.'
        )
        magnitude = np.abs(spectrum)

    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise Exception('top_db must be non-negative')
    # Limit the dynamic range to ``top_db`` below the peak.
    return np.maximum(log_spec, log_spec.max() - top_db)
# onset_strength
def onset_strength(
    y=None,
    sr=22050,
    S=None,
    lag=1,
    max_size=1,
    ref=None,
    detrend=False,
    center=True,
    feature=None,
    aggregate=None,
    **kwargs,
):
    """Compute a full-spectrum onset strength envelope.

    Thin wrapper around ``onset_strength_multi`` with ``channels=None``; the
    first row of its result is returned. All parameters (and any extra
    keyword arguments such as ``n_fft`` or ``hop_length``) are forwarded
    unchanged.

    Raises
    ------
    Exception
        When ``aggregate`` is ``False``: a single envelope requires
        aggregating across frequency bins.
    """
    if aggregate is False:
        # The placeholder used to be left unformatted, so the message showed
        # a literal "{}" instead of the offending value.
        raise Exception(
            'aggregate={} cannot be False when computing full-spectrum '
            'onset strength.'.format(aggregate)
        )

    odf_all = onset_strength_multi(
        y=y,
        sr=sr,
        S=S,
        lag=lag,
        max_size=max_size,
        ref=ref,
        detrend=detrend,
        center=center,
        feature=feature,
        aggregate=aggregate,
        channels=None,
        **kwargs,
    )
    return odf_all[0]
def onset_strength_multi(
    y=None,
    sr=22050,
    S=None,
    n_fft=2048,
    hop_length=512,
    lag=1,
    max_size=1,
    ref=None,
    detrend=False,
    center=True,
    feature=None,
    aggregate=None,
    channels=None,
    **kwargs,
):
    """Compute onset strength envelopes over one or more channels.

    Parameters
    ----------
    y : np.ndarray or None
        Audio signal, used only when ``S`` is ``None``.
    sr : number
        Sampling rate passed to ``feature``.
    S : np.ndarray or None
        Pre-computed spectrogram; used as given (no dB conversion).
        When ``None`` it is computed as ``power_to_db(|feature(...)|)``.
    n_fft, hop_length : int
        Passed to ``feature`` and used for the centring shift.
    lag : int >= 1
        Time lag (in frames) for the spectral difference.
    max_size : int >= 1
        Size of the local maximum filter applied along the frequency axis to
        build the reference spectrum; 1 disables filtering.
    ref : np.ndarray or None
        Pre-computed reference spectrum with the same shape as ``S``.
    detrend : bool
        Apply a DC-removing filter to the envelope.
    center : bool
        Shift the envelope by ``n_fft // (2 * hop_length)`` frames to
        counter framing effects and trim it to the input duration.
    feature : callable or None
        Spectrogram function; defaults to ``melspectrogram`` with
        ``fmax=11025.0``.
    aggregate : callable, False or None
        Aggregation across frequency bins (``np.mean`` by default); a falsy
        value skips aggregation.
    channels : list or None
        Slices or integer boundaries defining the channels; ``None`` means a
        single channel spanning every bin.
    **kwargs
        Extra arguments for ``feature``.

    Returns
    -------
    np.ndarray, 2-D
        One row per channel (or per frequency bin without aggregation).

    Raises
    ------
    Exception
        For invalid ``lag`` / ``max_size`` or a mismatched ``ref`` shape.
    """
    if feature is None:
        feature = melspectrogram
        kwargs.setdefault('fmax', 11025.0)

    if aggregate is None:
        aggregate = np.mean

    if lag < 1 or not isinstance(lag, (int, np.integer)):
        raise Exception('lag must be a positive integer')

    if max_size < 1 or not isinstance(max_size, (int, np.integer)):
        raise Exception('max_size must be a positive integer')

    # First, compute the spectrogram and convert it to dB
    if S is None:
        S = np.abs(feature(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, **kwargs))
        S = power_to_db(S)

    # Ensure that S is at least 2-d
    S = np.atleast_2d(S)

    # Compute the reference spectrogram.
    # Efficiency hack: skip filtering and alias S when max_size is a no-op.
    if ref is None:
        if max_size == 1:
            ref = S
        else:
            # Import the submodule explicitly instead of relying on
            # ``scipy.ndimage`` having been loaded as a side effect of other
            # imports. A ``from`` import keeps the module-level ``scipy``
            # name untouched for the rest of this function.
            from scipy.ndimage import maximum_filter1d

            ref = maximum_filter1d(S, max_size, axis=0)
    elif ref.shape != S.shape:
        raise Exception(
            'Reference spectrum shape {} must match input spectrum {}'.format(
                ref.shape, S.shape
            )
        )

    # Compute difference to the reference, spaced by lag
    onset_env = S[:, lag:] - ref[:, :-lag]

    # Discard negatives (decreasing amplitude)
    onset_env = np.maximum(0.0, onset_env)

    # Aggregate within channels
    pad = True
    if channels is None:
        channels = [slice(None)]
    else:
        pad = False

    if aggregate:
        onset_env = sync(onset_env, channels, aggregate=aggregate, pad=pad, axis=0)

    # Compensate for lag
    pad_width = lag
    if center:
        # Counter-act framing effects. Shift the onsets by n_fft / hop_length
        pad_width += n_fft // (2 * hop_length)

    onset_env = np.pad(onset_env, ([0, 0], [int(pad_width), 0]), mode='constant')

    # Remove the DC component
    if detrend:
        onset_env = scipy.signal.lfilter([1.0, -1.0], [1.0, -0.99], onset_env, axis=-1)

    # Trim to match the input duration
    if center:
        onset_env = onset_env[:, : S.shape[1]]

    return onset_env
def sync(data, idx, aggregate=None, pad=True, axis=-1):
    """Aggregate ``data`` over segments along ``axis``.

    ``idx`` is either a sequence of slices, used directly, or a sequence of
    integer boundaries that ``index_to_slice`` turns into slices (``pad``
    controls whether the ends of the axis are added as boundaries). Each
    segment is reduced with ``aggregate`` (``np.mean`` by default).

    Returns an array shaped like ``data`` except that ``axis`` has one entry
    per segment. Raises ``Exception`` for any other kind of ``idx``.
    """
    reducer = np.mean if aggregate is None else aggregate
    in_shape = list(data.shape)

    if all([isinstance(item, slice) for item in idx]):
        segments = idx
    elif all([np.issubdtype(type(item), np.integer) for item in idx]):
        segments = index_to_slice(np.asarray(idx), 0, in_shape[axis], pad=pad)
    else:
        raise Exception('Invalid index set: {}'.format(idx))

    out_shape = list(in_shape)
    out_shape[axis] = len(segments)
    memory_order = 'F' if np.isfortran(data) else 'C'
    result = np.empty(out_shape, order=memory_order, dtype=data.dtype)

    # Index templates: only the entry for ``axis`` changes per segment.
    source_index = [slice(None)] * data.ndim
    target_index = [slice(None)] * result.ndim

    for position, segment in enumerate(segments):
        source_index[axis] = segment
        target_index[axis] = position
        result[tuple(target_index)] = reducer(data[tuple(source_index)], axis=axis)

    return result
def index_to_slice(idx, idx_min=None, idx_max=None, step=None, pad=True):
    """Turn boundary indices into a list of consecutive ``slice`` objects.

    The indices are first normalised by ``fix_frames``; every pair of
    neighbouring boundaries then becomes ``slice(start, end, step)``.
    """
    boundaries = fix_frames(idx, idx_min, idx_max, pad=pad)
    return [
        slice(boundaries[i], boundaries[i + 1], step)
        for i in range(len(boundaries) - 1)
    ]
def fix_frames(frames, x_min=0, x_max=None, pad=True):
    """Return sorted, unique, integer frame indices within ``[x_min, x_max]``.

    With ``pad`` true the indices are clipped to the given bounds and the
    bounds themselves are added; with ``pad`` false, out-of-range indices are
    dropped instead. A bound of ``None`` is ignored.

    Raises ``Exception`` when any index is negative.
    """
    frames = np.asarray(frames)
    if np.any(frames < 0):
        raise Exception('Negative frame index detected')

    has_min = x_min is not None
    has_max = x_max is not None

    if pad:
        if has_min or has_max:
            frames = np.clip(frames, x_min, x_max)
        bounds = [bound for bound in (x_min, x_max) if bound is not None]
        frames = np.concatenate((bounds, frames))

    if has_min:
        frames = frames[frames >= x_min]
    if has_max:
        frames = frames[frames <= x_max]

    return np.unique(frames).astype(int)
def spectral_contrast(
    y=None,
    sr=22050,
    S=None,
    n_fft=2048,
    hop_length=512,
    win_length=None,
    window='hann',
    center=True,
    pad_mode='reflect',
    freq=None,
    fmin=200.0,
    n_bands=6,
    quantile=0.02,
    linear=False,
):
    """Compute spectral contrast per octave band and frame.

    The spectrum (``S`` or the magnitude STFT of ``y``) is split into
    ``n_bands + 1`` bands: ``[0, fmin]`` followed by octaves starting at
    ``fmin``. For each band and frame, the mean of the lowest ``quantile``
    of sorted bin values (valley) is compared with the mean of the highest
    ``quantile`` (peak).

    Returns ``peak - valley`` when ``linear`` is true, otherwise the
    difference of their dB values; shape ``(n_bands + 1, n_frames)``.

    Raises ``Exception`` for a mismatched ``freq``, a non-positive or
    non-int ``n_bands``, ``quantile`` outside (0, 1), non-positive ``fmin``,
    or a band starting at or above ``sr / 2``.
    """
    S, n_fft = _spectrogram(
        y=y,
        S=S,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    # Centre frequency of every spectrogram row.
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)
    freq = np.atleast_1d(freq)

    n_rows, n_frames = S.shape[0], S.shape[1]
    if freq.ndim != 1 or len(freq) != n_rows:
        raise Exception('freq.shape mismatch: expected ({:d},)'.format(n_rows))

    if n_bands < 1 or not isinstance(n_bands, int):
        raise Exception('n_bands must be a positive integer')

    if not 0.0 < quantile < 1.0:
        raise Exception('quantile must lie in the range (0, 1)')

    if fmin <= 0:
        raise Exception('fmin must be a positive number')

    # Band edges: 0, fmin, 2*fmin, 4*fmin, ...
    band_edges = np.zeros(n_bands + 2)
    band_edges[1:] = fmin * (2.0 ** np.arange(0, n_bands + 1))

    if np.any(band_edges[:-1] >= 0.5 * sr):
        raise Exception(
            'Frequency band exceeds Nyquist. Reduce either fmin or n_bands.'
        )

    valley = np.zeros((n_bands + 1, n_frames))
    peak = np.zeros_like(valley)

    for k, (f_low, f_high) in enumerate(zip(band_edges[:-1], band_edges[1:])):
        in_band = np.logical_and(freq >= f_low, freq <= f_high)
        members = np.flatnonzero(in_band)

        if k > 0:
            # Extend every band but the first by one bin below.
            in_band[members[0] - 1] = True

        if k == n_bands:
            # The last band takes everything above it as well.
            in_band[members[-1] + 1 :] = True

        band = S[in_band]
        if k < n_bands:
            band = band[:-1]

        # Always take at least one bin from each side.
        n_extreme = int(np.maximum(np.rint(quantile * np.sum(in_band)), 1))

        ordered = np.sort(band, axis=0)
        valley[k] = np.mean(ordered[:n_extreme], axis=0)
        peak[k] = np.mean(ordered[-n_extreme:], axis=0)

    if linear:
        return peak - valley
    return power_to_db(peak) - power_to_db(valley)
# db_to_amplitude
def db_to_amplitude(S_db, ref=1.0):
    """Convert decibels to amplitude: square root of ``db_to_power(S_db, ref ** 2)``."""
    power = db_to_power(S_db, ref=ref ** 2)
    return power ** 0.5
def db_to_power(S_db, ref=1.0):
    """Convert decibels to power: ``ref * 10 ** (S_db / 10)``."""
    gain = np.power(10.0, 0.1 * S_db)
    return ref * gain
# spectral_centroid
def spectral_centroid(
    y=None,
    sr=22050,
    S=None,
    n_fft=2048,
    hop_length=512,
    freq=None,
    win_length=None,
    window='hann',
    center=True,
    pad_mode='reflect',
):
    """Compute the spectral centroid of every frame.

    Each column of the spectrogram (``S`` or the magnitude STFT of ``y``) is
    normalized to sum to one and used as weights for the bin frequencies
    ``freq`` (FFT bin frequencies by default).

    Returns an array of shape ``(1, n_frames)``.

    Raises ``Exception`` when the spectrogram is complex or has negative
    entries.
    """
    S, n_fft = _spectrogram(
        y=y,
        S=S,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    if not np.isrealobj(S):
        raise Exception('Spectral centroid is only defined with real-valued input')
    if np.any(S < 0):
        raise Exception(
            'Spectral centroid is only defined with non-negative energies'
        )

    # Centre frequency of every bin.
    if freq is None:
        freq = fft_frequencies(sr=sr, n_fft=n_fft)

    # A 1-D frequency vector becomes a column so it broadcasts over frames.
    if freq.ndim == 1:
        freq = freq.reshape((-1, 1))

    # Column-normalize S and take the weighted mean frequency.
    weights = normalize(S, norm=1, axis=0)
    return np.sum(freq * weights, axis=0, keepdims=True)