Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import warnings
import utils
# --- Module constants (values from the reference STOI implementation) ---
FS = 10000 # Sampling frequency (Hz) that all signals are resampled to
N_FRAME = 256 # Analysis window length in samples (25.6 ms at 10 kHz)
NFFT = 512 # FFT size used for the STFT
NUMBAND = 15 # Number of 1/3 octave bands
MINFREQ = 150 # Center frequency of the first 1/3 octave band (Hz)
OBM, CF = utils.thirdoct(FS, NFFT, NUMBAND, MINFREQ) # 1/3 octave band matrix and center frequencies
N = 30 # Number of frames per segment for intermediate intelligibility (~384 ms)
BETA = -15. # Lower signal-to-distortion ratio (SDR) bound in dB, used for clipping
DYN_RANGE = 40 # Speech dynamic range (dB) used when removing silent frames
def stoi(x, y, fs_sig, extended=False):
    """ Short term objective intelligibility
    Computes the STOI (See [1][2]) of a denoised signal compared to a clean
    signal, The output is expected to have a monotonic relation with the
    subjective speech-intelligibility, where a higher score denotes better
    speech intelligibility.
    # Arguments
        x (np.ndarray): clean original speech
        y (np.ndarray): denoised speech
        fs_sig (int): sampling rate of x and y
        extended (bool): Boolean, whether to use the extended STOI described in [3]
    # Returns
        float: Short time objective intelligibility measure between clean and
        denoised speech
    # Raises
        ValueError : if x and y have different shapes
    # Reference
        [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
            Objective Intelligibility Measure for Time-Frequency Weighted Noisy
            Speech', ICASSP 2010, Texas, Dallas.
        [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
            Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
            IEEE Transactions on Audio, Speech, and Language Processing, 2011.
        [3] Jesper Jensen and Cees H. Taal, 'An Algorithm for Predicting the
            Intelligibility of Speech Masked by Modulated Noise Maskers',
            IEEE Transactions on Audio, Speech and Language Processing, 2016.
    """
    if x.shape != y.shape:
        # ValueError is the conventional type for bad argument values; it is
        # still caught by any caller catching the previous generic Exception.
        raise ValueError('x and y should have the same length, '
                         'found {} and {}'.format(x.shape, y.shape))
    # Resample if fs_sig is different than FS
    if fs_sig != FS:
        x = utils.resample_oct(x, FS, fs_sig)
        y = utils.resample_oct(y, FS, fs_sig)
    # Remove silent frames (frames more than DYN_RANGE dB below the max)
    x, y = utils.remove_silent_frames(x, y, DYN_RANGE, N_FRAME, int(N_FRAME/2))
    # Take STFT; result is (freq bins, frames) after the transpose
    x_spec = utils.stft(x, N_FRAME, NFFT, overlap=2).transpose()
    y_spec = utils.stft(y, N_FRAME, NFFT, overlap=2).transpose()
    # Ensure at least N frames remain for intermediate intelligibility
    if x_spec.shape[-1] < N:
        warnings.warn('Not enough STFT frames to compute intermediate '
                      'intelligibility measure after removing silent '
                      'frames. Returning 1e-5. Please check you wav files',
                      RuntimeWarning)
        return 1e-5
    # Apply OB matrix to the spectrograms as in Eq. (1)
    x_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(x_spec))))
    y_tob = np.sqrt(np.matmul(OBM, np.square(np.abs(y_spec))))
    # Take overlapping N-frame segments of x_tob, y_tob. The shapes of
    # x_tob and y_tob are identical here, so a single frame count is used
    # for both (the original iterated x_tob's shape in both loops).
    num_frames = x_tob.shape[1]
    x_segments = np.array(
        [x_tob[:, m - N:m] for m in range(N, num_frames + 1)])
    y_segments = np.array(
        [y_tob[:, m - N:m] for m in range(N, num_frames + 1)])
    if extended:
        # Extended STOI [3]: row/column normalize, then average correlations
        x_n = utils.row_col_normalize(x_segments)
        y_n = utils.row_col_normalize(y_segments)
        return np.sum(x_n * y_n / N) / x_n.shape[0]
    else:
        # Find normalization constants and normalize
        normalization_consts = (
            np.linalg.norm(x_segments, axis=2, keepdims=True) /
            (np.linalg.norm(y_segments, axis=2, keepdims=True) + utils.EPS))
        y_segments_normalized = y_segments * normalization_consts
        # Clip as described in [1]; BETA is the lower SDR bound in dB
        clip_value = 10 ** (-BETA / 20)
        y_primes = np.minimum(
            y_segments_normalized, x_segments * (1 + clip_value))
        # Subtract mean vectors
        y_primes = y_primes - np.mean(y_primes, axis=2, keepdims=True)
        x_segments = x_segments - np.mean(x_segments, axis=2, keepdims=True)
        # Divide by their norms
        y_primes /= (np.linalg.norm(y_primes, axis=2, keepdims=True) + utils.EPS)
        x_segments /= (np.linalg.norm(x_segments, axis=2, keepdims=True) + utils.EPS)
        # Find a matrix with entries summing to sum of correlations of vectors
        correlations_components = y_primes * x_segments
        # J, M as in [1], eq.6
        J = x_segments.shape[0]
        M = x_segments.shape[1]
        # Find the mean of all correlations
        d = np.sum(correlations_components) / (J * M)
        return d