towhee / torch-vggish

10 changed files with 575 additions and 2 deletions

@@ -1,3 +1,41 @@ README.md
# torch-vggish

# VGGish Embedding Operator (PyTorch)

This is another test repo.

Authors: Jael Gu

## Overview

This operator reads the waveform of an audio file and applies VGGish to extract features. The original VGGish model is built on top of TensorFlow [1]; this operator converts VGGish into **PyTorch**. Given an input audio file, it generates a set of vectors: each vector represents the features of a non-overlapping clip with a fixed length of 0.96 s, composed of 64 mel bands and 96 frames. The model is pre-trained on the large-scale audio dataset [AudioSet](https://research.google.com/audioset). As its authors suggest, the model is suitable for extracting high-level features or for warming up a larger model.

## Interface

```python
__call__(self, filepath: str)
```

**Args:**

- filepath:
  - the input audio path
  - supported types: `str`

**Returns:**

The operator returns a `NamedTuple('Outputs', [('embs', numpy.ndarray)])` containing the following fields:

- embs:
  - embeddings of the audio
  - data type: `numpy.ndarray`
  - shape: (num_clips, 128)
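
Below is a minimal usage sketch; the module name and audio path are hypothetical, while the operator class and return type follow the operator source in this repo:

```python
from torch_vggish import TorchVggish  # operator class defined in this repo

op = TorchVggish()         # loads the pretrained PyTorch VGGish weights
outputs = op('audio.wav')  # hypothetical 16-bit PCM WAV file
print(outputs.embs.shape)  # (num_clips, 128)
```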

## Requirements

The required Python packages are listed in [requirements.txt](./requirements.txt).

## How it works

The `towhee/torch-vggish` operator implements audio embedding and can be plugged into a Towhee pipeline. For example, it is the key operator of the pipeline [audio-embedding-vggish](https://hub.towhee.io/towhee/audio-embedding-vggish), sketched below.
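
A minimal sketch of invoking that pipeline (this assumes a Towhee release that exposes the `pipeline` entry point; the audio path is hypothetical):

```python
from towhee import pipeline

embedding_pipeline = pipeline('towhee/audio-embedding-vggish')
embs = embedding_pipeline('audio.wav')  # hypothetical local WAV file
```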

## Reference

[1] https://github.com/tensorflow/models/tree/master/research/audioset/vggish

[2] https://tfhub.dev/google/vggish/1

@@ -0,0 +1,13 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,24 @@
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

# Install timm at import time if it is missing.
try:
    import timm
except ModuleNotFoundError:
    os.system('pip install timm')

from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

@@ -0,0 +1,223 @@ mel_features.py
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Defines routines to compute mel spectrogram features from audio waveform."""

import numpy as np


def frame(data, window_length, hop_length):
  """Convert array into a sequence of successive possibly overlapping frames.

  An n-dimensional array of shape (num_samples, ...) is converted into an
  (n+1)-D array of shape (num_frames, window_length, ...), where each frame
  starts hop_length points after the preceding one.

  This is accomplished using stride_tricks, so the original data is not
  copied. However, there is no zero-padding, so any incomplete frames at the
  end are not included.

  Args:
    data: np.array of dimension N >= 1.
    window_length: Number of samples in each frame.
    hop_length: Advance (in samples) between each window.

  Returns:
    (N+1)-D np.array with as many rows as there are complete frames that can be
    extracted.
  """
  num_samples = data.shape[0]
  num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
  shape = (num_frames, window_length) + data.shape[1:]
  strides = (data.strides[0] * hop_length,) + data.strides
  return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
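
# A worked sketch with hypothetical values: frame(np.arange(10), 4, 2) gives
# 1 + (10 - 4) // 2 = 4 frames covering samples [0:4], [2:6], [4:8], [6:10],
# i.e. an array of shape (4, 4); a trailing incomplete frame would be dropped.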


def periodic_hann(window_length):
  """Calculate a "periodic" Hann window.

  The classic Hann window is defined as a raised cosine that starts and
  ends on zero, and where every value appears twice, except the middle
  point for an odd-length window. Matlab calls this a "symmetric" window
  and np.hanning() returns it. However, for Fourier analysis, this
  actually represents just over one cycle of a period N-1 cosine, and
  thus is not compactly expressed on a length-N Fourier basis. Instead,
  it's better to use a raised cosine that ends just before the final
  zero value - i.e. a complete cycle of a period-N cosine. Matlab
  calls this a "periodic" window. This routine calculates it.

  Args:
    window_length: The number of points in the returned window.

  Returns:
    A 1D np.array containing the periodic hann window.
  """
  return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
                             np.arange(window_length)))


def stft_magnitude(signal, fft_length,
                   hop_length=None,
                   window_length=None):
  """Calculate the short-time Fourier transform magnitude.

  Args:
    signal: 1D np.array of the input time-domain signal.
    fft_length: Size of the FFT to apply.
    hop_length: Advance (in samples) between each frame passed to FFT.
    window_length: Length of each block of samples to pass to FFT.

  Returns:
    2D np.array where each row contains the magnitudes of the fft_length/2+1
    unique values of the FFT for the corresponding frame of input samples.
  """
  frames = frame(signal, window_length, hop_length)
  # Apply frame window to each frame. We use a periodic Hann (cosine of period
  # window_length) instead of the symmetric Hann of np.hanning (period
  # window_length-1).
  window = periodic_hann(window_length)
  windowed_frames = frames * window
  return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))


# Mel spectrum constants and functions.
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0


def hertz_to_mel(frequencies_hertz):
  """Convert frequencies to mel scale using HTK formula.

  Args:
    frequencies_hertz: Scalar or np.array of frequencies in hertz.

  Returns:
    Object of same size as frequencies_hertz containing corresponding values
    on the mel scale.
  """
  return _MEL_HIGH_FREQUENCY_Q * np.log(
      1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))


def spectrogram_to_mel_matrix(num_mel_bins=20,
                              num_spectrogram_bins=129,
                              audio_sample_rate=8000,
                              lower_edge_hertz=125.0,
                              upper_edge_hertz=3800.0):
  """Return a matrix that can post-multiply spectrogram rows to make mel.

  Returns a np.array matrix A that can be used to post-multiply a matrix S of
  spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
  "mel spectrogram" M of frames x num_mel_bins. M = S A.

  The classic HTK algorithm exploits the complementarity of adjacent mel bands
  to multiply each FFT bin by only one mel weight, then add it, with positive
  and negative signs, to the two adjacent mel bands to which that bin
  contributes. Here, by expressing this operation as a matrix multiply, we go
  from num_fft multiplies per frame (plus around 2*num_fft adds) to around
  num_fft^2 multiplies and adds. However, because these are all presumably
  accomplished in a single call to np.dot(), it's not clear which approach is
  faster in Python. The matrix multiplication has the attraction of being more
  general and flexible, and much easier to read.

  Args:
    num_mel_bins: How many bands in the resulting mel spectrum. This is
      the number of columns in the output matrix.
    num_spectrogram_bins: How many bins there are in the source spectrogram
      data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
      only contains the nonredundant FFT bins.
    audio_sample_rate: Samples per second of the audio at the input to the
      spectrogram. We need this to figure out the actual frequencies for
      each spectrogram bin, which dictates how they are mapped into mel.
    lower_edge_hertz: Lower bound on the frequencies to be included in the mel
      spectrum. This corresponds to the lower edge of the lowest triangular
      band.
    upper_edge_hertz: The desired top edge of the highest frequency band.

  Returns:
    An np.array with shape (num_spectrogram_bins, num_mel_bins).

  Raises:
    ValueError: if frequency edges are incorrectly ordered or out of range.
  """
  nyquist_hertz = audio_sample_rate / 2.
  if lower_edge_hertz < 0.0:
    raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
  if lower_edge_hertz >= upper_edge_hertz:
    raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
                     (lower_edge_hertz, upper_edge_hertz))
  if upper_edge_hertz > nyquist_hertz:
    raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
                     (upper_edge_hertz, nyquist_hertz))
  spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
  spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
  # The i'th mel band (starting from i=1) has center frequency
  # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
  # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in
  # the band_edges_mel arrays.
  band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
                               hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
  # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
  # of spectrogram values.
  mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
  for i in range(num_mel_bins):
    lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
    # Calculate lower and upper slopes for every spectrogram bin.
    # Line segments are linear in the *mel* domain, not hertz.
    lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
                   (center_mel - lower_edge_mel))
    upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
                   (upper_edge_mel - center_mel))
    # .. then intersect them with each other and zero.
    mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
                                                          upper_slope))
  # HTK excludes the spectrogram DC bin; make sure it always gets a zero
  # coefficient.
  mel_weights_matrix[0, :] = 0.0
  return mel_weights_matrix
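
# In this operator, log_mel_spectrogram() below builds this matrix with the
# VGGish settings from vggish_params (num_mel_bins=64, audio_sample_rate=16000,
# lower_edge_hertz=125.0, upper_edge_hertz=7500.0), yielding a (257, 64)
# weight matrix for 512-point FFTs.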


def log_mel_spectrogram(data,
                        audio_sample_rate=8000,
                        log_offset=0.0,
                        window_length_secs=0.025,
                        hop_length_secs=0.010,
                        **kwargs):
  """Convert waveform to a log magnitude mel-frequency spectrogram.

  Args:
    data: 1D np.array of waveform data.
    audio_sample_rate: The sampling rate of data.
    log_offset: Add this to values when taking log to avoid -Infs.
    window_length_secs: Duration of each window to analyze.
    hop_length_secs: Advance between successive analysis windows.
    **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.

  Returns:
    2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
    magnitudes for successive frames.
  """
  window_length_samples = int(round(audio_sample_rate * window_length_secs))
  hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
  fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
  spectrogram = stft_magnitude(
      data,
      fft_length=fft_length,
      hop_length=hop_length_samples,
      window_length=window_length_samples)
  mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
      num_spectrogram_bins=spectrogram.shape[1],
      audio_sample_rate=audio_sample_rate, **kwargs))
  return np.log(mel_spectrogram + log_offset)
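
# Worked numbers for this repo's configuration (16 kHz audio, 25 ms window,
# 10 ms hop): window_length_samples = 400, hop_length_samples = 160,
# fft_length = 512, so each frame yields 257 spectrogram bins before the
# mel projection.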

@@ -0,0 +1,71 @@ pytorch/model.py
# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np

sys.path.append(str(Path(__file__).parent))

import vggish_input


class Model(nn.Module):
    """
    PyTorch implementation of the VGGish model.
    """
    def __init__(self):
        super().__init__()
        # VGG-style feature extractor: four conv blocks, each ending in a
        # 2x2 max pooling that halves the spatial resolution.
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2))
        # Fully connected head mapping the flattened features to a
        # 128-dimensional embedding.
        self.embeddings = nn.Sequential(
            nn.Linear(512 * 24, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, 128),
            # nn.ReLU(inplace=True)
        )

    def forward(self, x):
        x = self.features(x).permute(0, 2, 3, 1).contiguous()
        x = x.view(x.size(0), -1)
        x = self.embeddings(x)
        return x
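    # Shape sketch for a batch of 0.96 s clips: input (N, 1, 96, 64) ->
    # features (N, 512, 6, 4) -> permute/flatten to (N, 512 * 24) ->
    # embeddings (N, 128).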

    def preprocess(self, audio_path: str):
        audio_tensors = vggish_input.wavfile_to_examples(audio_path)
        return audio_tensors

    def train(self):
        """
        Placeholder for model training.
        """
        pass
Binary file not shown.

@@ -0,0 +1,98 @@ vggish_input.py
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Compute input examples for VGGish from audio waveform."""

# Modification: Return torch tensors rather than numpy arrays
import torch

import numpy as np
import resampy

import mel_features
import vggish_params

import soundfile as sf


def waveform_to_examples(data, sample_rate, return_tensor=True):
  """Converts audio waveform into an array of examples for VGGish.

  Args:
    data: np.array of either one dimension (mono) or two dimensions
      (multi-channel, with the outer dimension representing channels).
      Each sample is generally expected to lie in the range [-1.0, +1.0],
      although this is not required.
    sample_rate: Sample rate of data.
    return_tensor: Return data as a Pytorch tensor ready for VGGish.

  Returns:
    3-D np.array of shape [num_examples, num_frames, num_bands] which represents
    a sequence of examples, each of which contains a patch of log mel
    spectrogram, covering num_frames frames of audio and num_bands mel frequency
    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
    If return_tensor is True, a float torch.Tensor of shape
    [num_examples, 1, num_frames, num_bands] is returned instead.
  """
  # Convert to mono.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)
  # Resample to the rate assumed by VGGish.
  if sample_rate != vggish_params.SAMPLE_RATE:
    data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  # Compute log mel spectrogram features.
  log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Frame features into examples.
  features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  example_window_length = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
  example_hop_length = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
  log_mel_examples = mel_features.frame(
      log_mel,
      window_length=example_window_length,
      hop_length=example_hop_length)

  if return_tensor:
    log_mel_examples = torch.tensor(
        log_mel_examples, requires_grad=True)[:, None, :, :].float()

  return log_mel_examples
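
# Worked numbers: 10 s of 16 kHz mono audio -> 998 log-mel frames at a 10 ms
# hop -> with a 0.96 s (96-frame) example window and hop, 10 non-overlapping
# examples of shape (96, 64).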


def wavfile_to_examples(wav_file, return_tensor=True):
  """Convenience wrapper around waveform_to_examples() for a common WAV format.

  Args:
    wav_file: String path to a file, or a file-like object. The file
      is assumed to contain WAV audio data with signed 16-bit PCM samples.
    return_tensor: Return data as a Pytorch tensor ready for VGGish.

  Returns:
    See waveform_to_examples.
  """
  wav_data, sr = sf.read(wav_file, dtype='int16')
  assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
  samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
  return waveform_to_examples(samples, sr, return_tensor)
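
# Usage sketch (hypothetical path): wavfile_to_examples('audio.wav') returns a
# float tensor of shape (num_examples, 1, 96, 64), ready for Model.forward().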

@@ -0,0 +1,53 @@ vggish_params.py

# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Global parameters for the VGGish model.

See vggish_slim.py for more information.
"""

# Architectural constants.
NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.
EMBEDDING_SIZE = 128  # Size of embedding layer.

# Hyperparameters used in feature and example generation.
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500
LOG_OFFSET = 0.01  # Offset used for stabilized log of input mel-spectrogram.
EXAMPLE_WINDOW_SECONDS = 0.96  # Each example contains 96 10ms frames
EXAMPLE_HOP_SECONDS = 0.96  # with zero overlap.

# Parameters used for embedding postprocessing.
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL = -2.0
QUANTIZE_MAX_VAL = +2.0

# Hyperparameters used in training.
INIT_STDDEV = 0.01  # Standard deviation used to initialize weights.
LEARNING_RATE = 1e-4  # Learning rate for the Adam optimizer.
ADAM_EPSILON = 1e-8  # Epsilon for the Adam optimizer.

# Names of ops, tensors, and features.
INPUT_OP_NAME = 'vggish/input_features'
INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
OUTPUT_OP_NAME = 'vggish/embedding'
OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

@@ -0,0 +1,3 @@ requirements.txt

torch==1.9.0
numpy==1.19.5
soundfile
resampy  # imported by vggish_input.py but missing from the original list

@@ -0,0 +1,50 @@ torch_vggish.py

# Copyright 2021 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
import warnings
from pathlib import Path
from typing import NamedTuple

import numpy
import torch

from towhee.operator import Operator

warnings.filterwarnings("ignore")


class TorchVggish(Operator):
    """
    Audio embedding operator that converts an input audio file into VGGish
    embeddings using a PyTorch implementation of the model.
    """

    def __init__(self, framework: str = 'pytorch') -> None:
        super().__init__()
        if framework == 'pytorch':
            import importlib.util
            # Load pytorch/model.py as a module named after this operator file.
            path = os.path.join(str(Path(__file__).parent), 'pytorch', 'model.py')
            opname = os.path.basename(str(Path(__file__))).split('.')[0]
            spec = importlib.util.spec_from_file_location(opname, path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            self.model = module.Model()

            # Load the pretrained VGGish weights on CPU.
            path = str(Path(__file__).parent)
            self.model.load_state_dict(
                torch.load(path + '/pytorch/vggish.pth', map_location=torch.device('cpu')))

    def __call__(self, audio_path: str) -> NamedTuple('Outputs', [('embs', numpy.ndarray)]):
        audio_tensors = self.model.preprocess(audio_path)
        features = self.model.forward(audio_tensors)
        Outputs = NamedTuple('Outputs', [('embs', numpy.ndarray)])
        return Outputs(features.detach().numpy())
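
# Usage sketch (hypothetical audio path):
#   op = TorchVggish()
#   embs = op('audio.wav').embs  # numpy.ndarray of shape (num_clips, 128)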