@@ -53,16 +53,41 @@ class Vggish(NNOperator):
         self.model.eval()
         self.model.to(self.device)
 
-    def __call__(self, datas: List[NamedTuple('data', [('audio', 'ndarray'), ('sample_rate', 'int')])]) -> numpy.ndarray:
-        audios = numpy.hstack([item.audio for item in datas])
+    def __call__(self,
+                 datas: List[NamedTuple('data', [('audio', 'ndarray'), ('sample_rate', 'int'), ('layout', 'str')])]):
+        audios = [item.audio for item in datas]
         sr = datas[0].sample_rate
-        audio_array = numpy.reshape(audios, (-1, 1))
-        audio_tensors = self.preprocess(audio_array, sr).to(self.device)
+        layout = datas[0].layout
+        audio_tensors = self.preprocess(audios, sr, layout).to(self.device)
         features = self.model(audio_tensors)
         outs = features.to("cpu")
         return [AudioOutput(outs.detach().numpy())]
 
-    def preprocess(self, audio: numpy.ndarray, sr: int = None):
-        ii = numpy.iinfo(audio.dtype)
-        samples = 2 * audio / (ii.max - ii.min + 1)
-        return vggish_input.waveform_to_examples(samples, sr, return_tensor=True)
+    def preprocess(self, frames: List[numpy.ndarray], sr, layout):
+        audio = numpy.hstack(frames)
+        if layout == 'stereo':
+            audio = audio.reshape(-1, 2)
+        audio = self.int2float(audio)
+        try:
+            audio = audio.transpose()
+            audio_tensors = vggish_input.waveform_to_examples(audio, sr, return_tensor=True)
+            return audio_tensors
+        except Exception as e:
+            log.error("Fail to load audio data.")
+            raise e
+
+    def int2float(self, wav: numpy.ndarray, dtype: str = 'float64'):
+        """
+        Convert audio data from int to float.
+        The input dtype must be integers.
+        The output dtype is controlled by the parameter `dtype`, defaults to 'float64'.
+        The code is inspired by https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+        """
+        assert wav.dtype.kind in 'iu'
+        dtype = numpy.dtype(dtype)
+        assert dtype.kind == 'f'
+
+        ii = numpy.iinfo(wav.dtype)
+        abs_max = 2 ** (ii.bits - 1)
+        offset = ii.min + abs_max
+        return (wav.astype(dtype) - offset) / abs_max
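
For anyone sanity-checking the new preprocessing path, here is a minimal standalone sketch of what `preprocess` plus `int2float` do to interleaved stereo int16 PCM. It mirrors the arithmetic in the diff on synthetic data and stops before `vggish_input.waveform_to_examples`, so it is only an illustration, not the operator's full pipeline:

```python
import numpy

# Synthetic interleaved stereo PCM: two int16 frames, as an upstream decoder
# might emit them. Everything below mirrors the arithmetic in the diff.
frames = [
    numpy.array([0, 0, 16384, -16384], dtype=numpy.int16),
    numpy.array([32767, -32768, 8192, -8192], dtype=numpy.int16),
]

audio = numpy.hstack(frames)      # (8,) interleaved L/R samples
audio = audio.reshape(-1, 2)      # (4, 2) -- only done when layout == 'stereo'

# int2float: map integer PCM onto [-1.0, 1.0) in float64.
ii = numpy.iinfo(audio.dtype)
abs_max = 2 ** (ii.bits - 1)      # 32768 for int16
offset = ii.min + abs_max         # 0 for int16, 128 for uint8
samples = (audio.astype('float64') - offset) / abs_max

samples = samples.transpose()     # (2, 4), the orientation handed to
                                  # vggish_input.waveform_to_examples in the diff
print(samples.shape, samples.min(), samples.max())
# (2, 4) -1.0 0.999969482421875
```

For signed input the new scaling matches the old `2 * audio / (ii.max - ii.min + 1)` factor (for int16 both reduce to `wav / 32768.0`, since the offset is 0); the new helper additionally re-centres unsigned dtypes such as uint8, where the offset becomes 128.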
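The `__call__` signature change also affects how inputs are assembled upstream: each item now carries a `layout` field, and the per-frame arrays are kept as a list instead of being flattened into a single `(N, 1)` column. Below is a rough sketch of how such a batch might be constructed for a local test; the field names are taken from the annotation in the diff, while the values, sizes, and the commented-out operator call are made up for illustration:

```python
from collections import namedtuple

import numpy

# Field names follow the NamedTuple annotation in the diff; the data is synthetic.
AudioData = namedtuple('data', ['audio', 'sample_rate', 'layout'])

rng = numpy.random.default_rng(0)
datas = [
    AudioData(
        audio=rng.integers(-2 ** 15, 2 ** 15, size=8000, dtype=numpy.int16),
        sample_rate=16000,
        layout='mono',
    )
    for _ in range(3)
]

# What the new __call__ does with the batch before preprocessing:
audios = [item.audio for item in datas]   # list of per-frame arrays, no hstack yet
sr = datas[0].sample_rate
layout = datas[0].layout
print(len(audios), sr, layout)            # 3 16000 mono

# vggish_op = Vggish(...)          # hypothetical: requires the operator package
# embeddings = vggish_op(datas)    # and model weights; returns [AudioOutput(...)]
```

Keeping `audios` as a list lets `preprocess` decide how to stack and reshape the samples per `layout`, rather than forcing everything through the old `numpy.reshape(audios, (-1, 1))` single-column path.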