|
|
@ -53,16 +53,41 @@ class Vggish(NNOperator): |
|
|
|
self.model.eval() |
|
|
|
self.model.to(self.device) |
|
|
|
|
|
|
|
def __call__(self,
             datas: List[NamedTuple('data', [('audio', 'ndarray'), ('sample_rate', 'int'), ('layout', 'str')])]):
    """
    Run the model on a sequence of audio frames and return its output.

    The sample rate and channel layout are read from the first item only and
    are applied to every frame in `datas`.

    Args:
        datas: items exposing `audio`, `sample_rate` and `layout` attributes.

    Returns:
        A single-element list holding an `AudioOutput` built from the model
        output converted to a numpy array on the CPU.
    """
    # Hand the raw per-frame arrays to `preprocess`; it is responsible for
    # joining them and for interpreting the channel layout.
    audios = [item.audio for item in datas]
    sr = datas[0].sample_rate
    layout = datas[0].layout
    audio_tensors = self.preprocess(audios, sr, layout).to(self.device)
    features = self.model(audio_tensors)
    outs = features.to("cpu")
    return [AudioOutput(outs.detach().numpy())]
|
|
|
|
|
|
|
def preprocess(self, audio: numpy.ndarray, sr: int = None):
    """
    Scale integer audio samples to floats and convert them to model examples.

    Args:
        audio (numpy.ndarray): samples with an integer dtype (`numpy.iinfo`
            rejects any other dtype).
        sr (int): sample rate, forwarded to `vggish_input.waveform_to_examples`.

    Returns:
        The result of `vggish_input.waveform_to_examples(..., return_tensor=True)`.
    """
    ii = numpy.iinfo(audio.dtype)
    # Width of the integer type's value range, e.g. 65536 for int16.
    full_scale = float(ii.max - ii.min + 1)
    # Promote to float BEFORE doubling: `2 * audio` evaluated in the integer
    # dtype silently wraps around for samples at or above half of full scale.
    samples = audio.astype(numpy.float64) * 2 / full_scale
    return vggish_input.waveform_to_examples(samples, sr, return_tensor=True)
|
|
|
def preprocess(self, frames: List[numpy.ndarray], sr, layout):
    """
    Join audio frames, normalise them to floats and build model examples.

    Args:
        frames (List[numpy.ndarray]): integer sample arrays, stacked with
            `numpy.hstack`.
        sr: sample rate, forwarded to `vggish_input.waveform_to_examples`.
        layout: channel layout; when equal to 'stereo' the joined samples
            are reshaped into rows of two values.

    Returns:
        The result of `vggish_input.waveform_to_examples(..., return_tensor=True)`.

    Raises:
        Exception: any error from the transpose/conversion step is logged
            and then re-raised.
    """
    waveform = numpy.hstack(frames)
    is_stereo = layout == 'stereo'
    if is_stereo:
        # Two values per row for stereo input.
        waveform = waveform.reshape(-1, 2)
    waveform = self.int2float(waveform)
    try:
        return vggish_input.waveform_to_examples(waveform.transpose(), sr, return_tensor=True)
    except Exception as e:
        log.error("Fail to load audio data.")
        raise e
|
|
|
|
|
|
|
def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
    """
    Map integer audio samples onto floating point values in [-1.0, 1.0).

    Signed input is divided by half of the type's range; unsigned input is
    first shifted so that the middle of its range becomes zero.

    Args:
        wav (numpy.ndarray): samples with a signed or unsigned integer dtype.
        dtype (str): floating point dtype of the result, 'float64' by default.

    Returns:
        numpy.ndarray: the rescaled samples.

    Raises:
        AssertionError: if `wav` is not an integer array or `dtype` is not a
            floating point dtype.

    The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
    """
    assert wav.dtype.kind in ('i', 'u')
    target = numpy.dtype(dtype)
    assert target.kind == 'f'

    int_info = numpy.iinfo(wav.dtype)
    # Half of the integer range, e.g. 32768 for 16-bit samples.
    half_range = 2 ** (int_info.bits - 1)
    # Zero for signed types, the mid-point of the range for unsigned ones.
    midpoint = int_info.min + half_range
    centred = wav.astype(target) - midpoint
    return centred / half_range
|
|
|