diff --git a/src/audio/base_fw.c b/src/audio/base_fw.c index b86db469765a..c5a874e41c54 100644 --- a/src/audio/base_fw.c +++ b/src/audio/base_fw.c @@ -100,6 +100,10 @@ static void get_codec_info(struct sof_tlv **tuple) codec_info.items[codec_info.count++] = SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_VORBIS, SOF_IPC_STREAM_PLAYBACK); #endif +#ifdef CONFIG_COMP_MFCC + codec_info.items[codec_info.count++] = + SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_BESPOKE, SOF_IPC_STREAM_CAPTURE); +#endif if (!codec_info.count) return; diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index f8af79d1ca8a..274c7aa05eb8 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -4,5 +4,8 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext) add_dependencies(app mfcc) else() - add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c) + add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c) + if(CONFIG_IPC_MAJOR_4) + add_local_sources(sof mfcc_ipc4.c) + endif() endif() diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index ea09d919009b..971e088cc2cf 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -36,29 +38,31 @@ LOG_MODULE_REGISTER(mfcc, CONFIG_SOF_LOG_LEVEL); SOF_DEFINE_REG_UUID(mfcc); -__cold_rodata const struct mfcc_func_map mfcc_fm[] = { +/** \brief Source/sink API based source copy function map. 
*/ +struct mfcc_source_func_map { + uint8_t source; + mfcc_source_func func; +}; + +__cold_rodata static const struct mfcc_source_func_map mfcc_sfm[] = { #if CONFIG_FORMAT_S16LE - {SOF_IPC_FRAME_S16_LE, mfcc_s16_default}, -#endif /* CONFIG_FORMAT_S16LE */ + {SOF_IPC_FRAME_S16_LE, mfcc_source_copy_s16}, +#endif #if CONFIG_FORMAT_S24LE - {SOF_IPC_FRAME_S24_4LE, mfcc_s24_default}, -#endif /* CONFIG_FORMAT_S24LE */ + {SOF_IPC_FRAME_S24_4LE, mfcc_source_copy_s24}, +#endif #if CONFIG_FORMAT_S32LE - {SOF_IPC_FRAME_S32_LE, mfcc_s32_default}, -#endif /* CONFIG_FORMAT_S32LE */ + {SOF_IPC_FRAME_S32_LE, mfcc_source_copy_s32}, +#endif }; -static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format, - enum sof_ipc_frame sink_format, - const struct mfcc_func_map *map, - int n) +static mfcc_source_func mfcc_find_source_func(enum sof_ipc_frame source_format) { int i; - /* Find suitable processing function from map. */ - for (i = 0; i < n; i++) { - if (source_format == map[i].source) - return map[i].func; + for (i = 0; i < ARRAY_SIZE(mfcc_sfm); i++) { + if (source_format == mfcc_sfm[i].source) + return mfcc_sfm[i].func; } return NULL; @@ -97,56 +101,47 @@ static int mfcc_free(struct processing_module *mod) struct mfcc_comp_data *cd = module_get_private_data(mod); comp_info(mod->dev, "entry"); + ipc_msg_free(cd->msg); + cd->msg = NULL; mod_data_blob_handler_free(mod, cd->model_handler); mfcc_free_buffers(mod); mod_free(mod, cd); return 0; } -static int mfcc_get_config(struct processing_module *mod, - uint32_t config_id, uint32_t *data_offset_size, - uint8_t *fragment, size_t fragment_size) -{ - struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; - struct mfcc_comp_data *cd = module_get_private_data(mod); - - comp_info(mod->dev, "entry"); - - return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); -} - -static int mfcc_set_config(struct processing_module *mod, uint32_t config_id, - enum module_cfg_fragment_position pos, uint32_t 
data_offset_size, - const uint8_t *fragment, size_t fragment_size, uint8_t *response, - size_t response_size) -{ - struct mfcc_comp_data *cd = module_get_private_data(mod); - - comp_info(mod->dev, "entry"); - - return comp_data_blob_set(cd->model_handler, pos, data_offset_size, - fragment, fragment_size); -} +/** + * \brief Source/sink API based process function for MFCC. + * + * Reads input audio from sof_source, runs the STFT/Mel/DCT stage, and + * delegates output formatting and commit handling to mfcc_common.c. + */ static int mfcc_process(struct processing_module *mod, - struct input_stream_buffer *input_buffers, int num_input_buffers, - struct output_stream_buffer *output_buffers, int num_output_buffers) + struct sof_source **sources, int num_of_sources, + struct sof_sink **sinks, int num_of_sinks) { struct mfcc_comp_data *cd = module_get_private_data(mod); - struct audio_stream *source = input_buffers->data; - struct audio_stream *sink = output_buffers->data; - int frames = input_buffers->size; - - comp_dbg(mod->dev, "start"); - - frames = MIN(frames, cd->max_frames); - cd->mfcc_func(mod, input_buffers, output_buffers, frames); - - /* TODO: use module_update_buffer_position() from #6194 */ - input_buffers->consumed += audio_stream_frame_bytes(source) * frames; - output_buffers->size += audio_stream_frame_bytes(sink) * frames; - comp_dbg(mod->dev, "done"); - return 0; + struct comp_dev *dev = mod->dev; + struct mfcc_state *state = &cd->state; + size_t source_avail; + int frames; + int num_ceps; + + comp_dbg(dev, "start"); + source_avail = source_get_data_frames_available(sources[0]); + frames = MIN(source_avail, cd->max_frames); + if (frames == 0) + return -ENODATA; + + /* Copy input audio from source to MFCC internal circular buffer */ + cd->source_func(sources[0], &state->buf, &state->emph, frames, state->source_channel); + + /* Run STFT and Mel/DCT processing */ + num_ceps = mfcc_stft_process(mod, cd); + if (num_ceps < 0) + return num_ceps; + + return 
mfcc_process_output(mod, cd, sources, sinks, num_ceps, frames); } static int mfcc_prepare(struct processing_module *mod, @@ -187,22 +182,41 @@ static int mfcc_prepare(struct processing_module *mod, audio_stream_get_channels(&sourceb->stream)); if (ret < 0) { comp_err(dev, "setup failed."); - goto err; + return ret; } + } else { + comp_err(dev, "configuration is missing."); + return -EINVAL; } - cd->mfcc_func = mfcc_find_func(source_format, sink_format, mfcc_fm, ARRAY_SIZE(mfcc_fm)); - if (!cd->mfcc_func) { - comp_err(dev, "No proc func"); - ret = -EINVAL; - goto err; + cd->source_func = mfcc_find_source_func(source_format); + if (!cd->source_func) { + comp_err(dev, "No source func"); + mfcc_free_buffers(mod); + return -EINVAL; } - return 0; + cd->source_format = source_format; -err: - comp_set_state(dev, COMP_TRIGGER_RESET); - return ret; + if (cd->config->compress_output) + comp_info(dev, "compress PCM output mode enabled"); + + if (cd->config->enable_dtx && !cd->config->compress_output) + comp_warn(dev, "enable_dtx ignored in normal PCM mode, only applies to compress"); + + /* Initialize VAD switch control notification if enabled */ + if (cd->config->enable_vad && cd->config->update_controls) { + if (!cd->msg) { + ret = mfcc_ipc_notification_init(mod); + if (ret < 0) { + mfcc_free_buffers(mod); + return ret; + } + } + } + + cd->vad_prev = false; + return 0; } static int mfcc_reset(struct processing_module *mod) @@ -211,8 +225,13 @@ static int mfcc_reset(struct processing_module *mod) comp_info(mod->dev, "entry"); + /* Free MFCC buffers to prevent leaks on reset->prepare cycles. + * mfcc_free_buffers() NULLs the pointers after free. 
+ */ + mfcc_free_buffers(mod); + /* Reset to similar state as init() */ - cd->mfcc_func = NULL; + cd->source_func = NULL; return 0; } @@ -221,7 +240,7 @@ static const struct module_interface mfcc_interface = { .free = mfcc_free, .set_configuration = mfcc_set_config, .get_configuration = mfcc_get_config, - .process_audio_stream = mfcc_process, + .process = mfcc_process, .prepare = mfcc_prepare, .reset = mfcc_reset, }; diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 1079864e9259..4713df3d2566 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -7,7 +7,8 @@ #include #include -#include +#include +#include #include #include #include @@ -20,15 +21,156 @@ #include #include #include +#include + +#include LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); +/* + * Source/sink API based source copy functions. + * These use sof_source API and are compiled on all platforms (generic, HiFi3, HiFi4). + */ + +#if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int16_t const *src_ptr; + int16_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int16_t); + int16_t *w = buf->w_ptr; + int16_t const *x; + int32_t s; + int ret; + int i; + + ret = source_get_data_s16(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x, 15, 30); + *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = *x; + } else { + *w = *x; + } + x += num_channels; + /* Wrap source pointer */ + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + 
source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S16LE */ + +#if CONFIG_FORMAT_S24LE +void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int32_t const *src_ptr; + int32_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int32_t); + int16_t *w = buf->w_ptr; + int32_t const *x; + int32_t s, tmp; + int ret; + int i; + + ret = source_get_data_s32(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)((uint32_t)*x << 8); + tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30); + *w = sat_int16(Q_SHIFT_RND(tmp, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15)); + } else { + s = (int32_t)((uint32_t)*x << 8); + *w = sat_int16(Q_SHIFT_RND(s, 31, 15)); + } + x += num_channels; + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S24LE */ + +#if CONFIG_FORMAT_S32LE +void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel) +{ + int32_t const *src_ptr; + int32_t const *src_start; + int src_samples; + int num_channels = source_get_channels(source); + size_t req_bytes = frames * num_channels * sizeof(int32_t); + int16_t *w = buf->w_ptr; + int32_t const *x; + int32_t s; + int ret; + int i; + + ret = source_get_data_s32(source, req_bytes, &src_ptr, &src_start, &src_samples); + if (ret) + return; + + x = src_ptr + source_channel; + for (i = 0; i < frames; i++) { + if (emph->enable) { + s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x, 31, 30); + *w = 
sat_int16(Q_SHIFT_RND(s, 30, 15)); + emph->delay = sat_int16(Q_SHIFT_RND(*x, 31, 15)); + } else { + *w = sat_int16(Q_SHIFT_RND(*x, 31, 15)); + } + x += num_channels; + if (x >= src_start + src_samples) + x -= src_samples; + + w++; + w = mfcc_buffer_wrap(buf, w); + } + + buf->s_avail += frames; + buf->s_free -= frames; + buf->w_ptr = w; + source_release_data(source, req_bytes); +} +#endif /* CONFIG_FORMAT_S32LE */ + /* * The main processing function for MFCC */ -static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd) +int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd) { + const struct comp_dev *dev = mod->dev; struct sof_mfcc_config *config = cd->config; struct mfcc_state *state = &cd->state; struct mfcc_buffer *buf = &state->buf; @@ -144,11 +286,6 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23)); } - /* Store Q9.7 version in mel_spectra for s16 output mode */ - for (j = 0; j < state->dct.num_in; j++) - state->mel_spectra->data[j] = - sat_int16(state->mel_log_32[j] >> 16); - /* Enable this to check mmax decay */ comp_dbg(dev, "state->mmax = %d", state->mmax); } else { @@ -169,343 +306,308 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * cc_count += state->dct.num_out; } - } - return cc_count; -} - -void mfcc_fill_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_buffer *buf = &state->buf; - struct mfcc_fft *fft = &state->fft; - int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real; - const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); - int16_t *prev = state->prev_data; - int16_t *prev_end = prev + state->prev_data_size; - int16_t *r = buf->r_ptr; - int copied; - int nmax; - int n; - int j; + /* Use hop counter for frame numbering (independent of VAD enable) */ + state->header.frame_number = state->hop_count; - /* Copy overlapped samples from state buffer. 
The fft_buf has been - * cleared by caller so imaginary part remains zero. - */ - while (prev < prev_end) { - *d = *prev++; - d += fft_elem_inc; - } + /* Run VAD on the mel log spectrum (available in both modes) */ + if (config->enable_vad) { + mfcc_vad_update(&cd->vad, state->mel_log_32); - /* Copy hop size of new data from circular buffer */ - for (copied = 0; copied < fft->fft_hop_size; copied += n) { - nmax = fft->fft_hop_size - copied; - n = mfcc_buffer_samples_without_wrap(buf, r); - n = MIN(n, nmax); - for (j = 0; j < n; j++) { - *d = *r++; - d += fft_elem_inc; + /* Populate data header for this output frame */ + state->header.energy = cd->vad.energy; + state->header.noise_energy = cd->vad.noise_energy; + state->header.vad_flag = cd->vad.is_speech ? 1 : 0; } - r = mfcc_buffer_wrap(buf, r); - } - buf->s_avail -= copied; - buf->s_free += copied; - buf->r_ptr = r; + /* Increment hop counter at end of hop processing */ + state->hop_count++; - /* Copy for next time data back to overlap buffer */ - d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real; - prev = state->prev_data; - while (prev < prev_end) { - *prev++ = *d; - d += fft_elem_inc; - } -} + /* Send notification when VAD state changes */ + if (config->enable_vad && config->update_controls) { + bool vad_now = cd->vad.is_speech; -#if CONFIG_FORMAT_S16LE -static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples) -{ - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - memset(w_ptr, 0, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); + if (vad_now != cd->vad_prev) { + mfcc_send_vad_notification(mod, vad_now ? 
1 : 0); + cd->vad_prev = vad_now; + } + } } - return w_ptr; + return cc_count; } -static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr, - int samples, int16_t *r_ptr) +/** + * \brief Write bytes into a possibly wrapped sink buffer. + */ +static size_t mfcc_sink_write_bytes(uint8_t **dst, uint8_t *buf_start, + size_t buf_size, const uint8_t *src, + size_t max_bytes) { - int copied; - int nmax; - int n; + uint8_t *buf_end = buf_start + buf_size; + size_t chunk; - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s16(sink, w_ptr); - n = MIN(n, nmax); - /* Not using memcpy_s() due to speed need */ - memcpy(w_ptr, r_ptr, n * sizeof(int16_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - r_ptr += n; + if (max_bytes == 0) + return 0; + + chunk = MIN(max_bytes, (size_t)(buf_end - *dst)); + memcpy(*dst, src, chunk); + if (chunk < max_bytes) { + memcpy(buf_start, src + chunk, max_bytes - chunk); + *dst = buf_start + (max_bytes - chunk); + } else { + *dst += chunk; + if (*dst >= buf_end) + *dst = buf_start; } - return w_ptr; + return max_bytes; } -void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +/** + * \brief Prepare the next MFCC output frame after STFT processing. + */ +static void mfcc_prepare_output(struct mfcc_state *state, int num_ceps) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); - struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; - int16_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 2; - int num_ceps; - int sink_samples; - int to_copy; - - /* Get samples from source buffer */ - mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT: Mel auditory filter and DCT. 
*/ - num_ceps = mfcc_stft_process(mod->dev, cd); - - /* If new output produced, set up pointer into scratch data and mark magic pending */ - if (num_ceps > 0) { - if (state->mel_only) - state->out_data_ptr = state->mel_spectra->data; - else - state->out_data_ptr = state->cepstral_coef->data; - - state->out_remain = num_ceps; - state->magic_pending = true; - } + int k; - /* Write to sink, limited by period size */ - sink_samples = frames * audio_stream_get_channels(sink); + if (num_ceps <= 0) + return; - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; - } + if (state->mel_only) { + state->out_data_ptr = state->mel_log_32; + } else { + /* Widen int16 Q9.7 cepstral coefficients to int32 Q9.23. + * Safe to copy forward: cepstral_coef is in fft_out while + * mel_log_32 is in fft_buf (separate scratch buffers). + */ + for (k = 0; k < num_ceps; k++) + state->mel_log_32[k] = (int32_t)state->cepstral_coef->data[k] << 16; - /* Write cepstral/mel data from scratch buffer */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, to_copy, state->out_data_ptr); - state->out_data_ptr += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; + state->out_data_ptr = state->mel_log_32; } - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples); + state->out_remain = num_ceps; + state->header_pending = true; } -#endif /* CONFIG_FORMAT_S16LE */ -#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE -static int32_t *mfcc_sink_copy_zero_s32(const struct audio_stream *sink, int32_t *w_ptr, - int samples) +/** + * \brief Commit MFCC output in compress mode. 
+ */ +static int mfcc_output_compress(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_sink **sinks, int num_ceps) { - int copied; - int nmax; - int n; - - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s32(sink, w_ptr); - n = MIN(n, nmax); - memset(w_ptr, 0, n * sizeof(int32_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); + struct comp_dev *dev = mod->dev; + struct mfcc_state *state = &cd->state; + size_t out_bytes; + size_t commit_bytes; + void *sink_ptr; + void *sink_start; + size_t sink_buf_size; + int ret; + + if (num_ceps <= 0) + return 0; + + out_bytes = sizeof(state->header) + num_ceps * sizeof(int32_t); + + if (cd->config->enable_vad && !cd->vad.is_speech) { + state->vad_silence_count++; + /* With DTX enabled, send trailing silence frames + * (configurable count) then suppress. After trailing + * frames, optionally send periodic silence updates + * at the configured interval. This gives the host + * enough silence to detect end-of-speech while + * keeping alive updates during long silence. + * Without DTX, output every frame regardless of VAD. 
+ */ + if (cd->config->enable_dtx) { + if (state->vad_silence_count > state->dtx_trailing_silence) { + /* Check periodic silence frame send */ + if (state->dtx_silence_interval > 0) { + state->dtx_silence_counter++; + if (state->dtx_silence_counter >= state->dtx_silence_interval) { + state->dtx_silence_counter = 0; + goto send_frame; + } + } + state->header_pending = false; + state->out_remain = 0; + return 0; + } + } + } else { + state->vad_silence_count = 0; + state->dtx_silence_counter = 0; } - return w_ptr; -} +send_frame: + commit_bytes = out_bytes; -static int32_t *mfcc_sink_copy_data_s32(const struct audio_stream *sink, int32_t *w_ptr, - int samples, int32_t *r_ptr) -{ - int copied; - int nmax; - int n; + if (sink_get_free_size(sinks[0]) < commit_bytes) + return -ENOSPC; - for (copied = 0; copied < samples; copied += n) { - nmax = samples - copied; - n = audio_stream_samples_without_wrap_s32(sink, w_ptr); - n = MIN(n, nmax); - /* Not using memcpy_s() due to speed need */ - memcpy(w_ptr, r_ptr, n * sizeof(int32_t)); - w_ptr = audio_stream_wrap(sink, w_ptr + n); - r_ptr += n; + ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr, + &sink_start, &sink_buf_size); + if (ret) + return ret; + + { + uint8_t *dst = sink_ptr; + + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)&state->header, sizeof(state->header)); + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)state->out_data_ptr, + num_ceps * sizeof(int32_t)); } - return w_ptr; + state->header_pending = false; + state->out_remain = 0; + + sink_commit_buffer(sinks[0], commit_bytes); + comp_dbg(dev, "done, produced %zu bytes", commit_bytes); + return 0; } -#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */ -#if CONFIG_FORMAT_S24LE -void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +/** + * \brief Commit MFCC output in legacy PCM mode. 
+ */ +static int mfcc_output_legacy(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int frames) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); + struct comp_dev *dev = mod->dev; struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; - int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ - int num_ceps; - int sink_samples; - int remain_s32; - int to_copy; - int k; - - /* Get samples from source buffer */ - mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); - - /* If new output produced, set up pointer into scratch data */ - if (num_ceps > 0) { - if (state->mel_only) { - /* Convert mel_log_32 from Q9.23 to Q9.15 in-place */ - for (k = 0; k < num_ceps; k++) - state->mel_log_32[k] >>= 8; - - state->out_data_ptr_32 = state->mel_log_32; - } else { - state->out_data_ptr = state->cepstral_coef->data; + size_t commit_bytes; + void *sink_ptr; + void *sink_start; + size_t sink_buf_size; + int ret; + + commit_bytes = sink_get_frame_bytes(sinks[0]); + commit_bytes *= frames; + + if (sink_get_free_size(sinks[0]) < commit_bytes) + return -ENOSPC; + + ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr, + &sink_start, &sink_buf_size); + if (ret) + return ret; + + /* Zero-fill entire period first */ + { + size_t bytes_to_end = (size_t)((uint8_t *)sink_start + sink_buf_size - + (uint8_t *)sink_ptr); + + if (bytes_to_end >= commit_bytes) + memset(sink_ptr, 0, commit_bytes); + else { + memset(sink_ptr, 0, bytes_to_end); + memset(sink_start, 0, commit_bytes - bytes_to_end); } - - state->out_remain = num_ceps; - state->magic_pending = true; } - /* Write to sink, limited by period size */ - sink_samples = frames * 
audio_stream_get_channels(sink); + { + uint8_t *dst = sink_ptr; + size_t avail = commit_bytes; - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; - } + /* Write pending header */ + if (state->header_pending && avail > 0) { + size_t hdr_size = sizeof(state->header); - if (state->mel_only) { - /* Write 32-bit mel data Q9.15, one value per int32_t */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - state->out_data_ptr_32); - state->out_data_ptr_32 += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; + if (avail >= hdr_size) { + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)&state->header, hdr_size); + avail -= hdr_size; + state->header_pending = false; + } } - } else { - /* Write cepstral data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; - sink_samples -= to_copy; + /* Write pending feature data (always int32) */ + if (state->out_remain > 0 && avail > 0) { + size_t data_bytes; + size_t to_write; + + data_bytes = state->out_remain * sizeof(int32_t); + to_write = MIN(data_bytes, avail) & ~(size_t)3; + if (to_write > 0) { + int n32; + + mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size, + (uint8_t *)state->out_data_ptr, + to_write); + n32 = to_write / sizeof(int32_t); + state->out_data_ptr += n32; + state->out_remain -= n32; + } } } - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + 
sink_commit_buffer(sinks[0], commit_bytes); + comp_dbg(dev, "done, produced %zu bytes", commit_bytes); + return 0; } -#endif /* CONFIG_FORMAT_S24LE */ -#if CONFIG_FORMAT_S32LE -void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames) +int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int num_ceps, int frames) { - struct audio_stream *sink = bsink->data; - struct mfcc_comp_data *cd = module_get_private_data(mod); - struct mfcc_state *state = &cd->state; - struct mfcc_buffer *buf = &cd->state.buf; - uint32_t magic = MFCC_MAGIC; - int32_t *w_ptr = audio_stream_get_wptr(sink); - const int num_magic = 1; /* one int32_t word for magic */ - int num_ceps; - int sink_samples; - int remain_s32; - int to_copy; - - /* Get samples from source buffer */ - mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel); - - /* Run STFT and processing after FFT */ - num_ceps = mfcc_stft_process(mod->dev, cd); - - /* If new output produced, set up pointer into scratch data */ - if (num_ceps > 0) { - if (state->mel_only) { - state->out_data_ptr_32 = state->mel_log_32; - } else { - state->out_data_ptr = state->cepstral_coef->data; - } + if (num_ceps > 0) + mfcc_prepare_output(&cd->state, num_ceps); - state->out_remain = num_ceps; - state->magic_pending = true; - } + if (cd->config->compress_output) + return mfcc_output_compress(mod, cd, sinks, num_ceps); - /* Write to sink, limited by period size */ - sink_samples = frames * audio_stream_get_channels(sink); + return mfcc_output_legacy(mod, cd, sources, sinks, frames); +} - /* Write magic word first if pending */ - if (state->magic_pending && sink_samples >= num_magic) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic); - sink_samples -= num_magic; - state->magic_pending = false; - } +void mfcc_fill_fft_buffer(struct 
mfcc_state *state) +{ + struct mfcc_buffer *buf = &state->buf; + struct mfcc_fft *fft = &state->fft; + int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real; + const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); + int16_t *prev = state->prev_data; + int16_t *prev_end = prev + state->prev_data_size; + int16_t *r = buf->r_ptr; + int copied; + int nmax; + int n; + int j; - if (state->mel_only) { - /* Write 32-bit mel data Q9.23, one value per int32_t */ - to_copy = MIN(state->out_remain, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - state->out_data_ptr_32); - state->out_data_ptr_32 += to_copy; - state->out_remain -= to_copy; - sink_samples -= to_copy; - } - } else { - /* Write cepstral data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; + /* Copy overlapped samples from state buffer. The fft_buf has been + * cleared by caller so imaginary part remains zero. 
+ */ + while (prev < prev_end) { + *d = *prev++; + d += fft_elem_inc; + } - sink_samples -= to_copy; + /* Copy hop size of new data from circular buffer */ + for (copied = 0; copied < fft->fft_hop_size; copied += n) { + nmax = fft->fft_hop_size - copied; + n = mfcc_buffer_samples_without_wrap(buf, r); + n = MIN(n, nmax); + for (j = 0; j < n; j++) { + *d = *r++; + d += fft_elem_inc; } + r = mfcc_buffer_wrap(buf, r); } - /* Zero-fill remaining sink samples */ - w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples); + buf->s_avail -= copied; + buf->s_free += copied; + buf->r_ptr = r; + + /* Copy for next time data back to overlap buffer */ + d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real; + prev = state->prev_data; + while (prev < prev_end) { + *prev++ = *d; + d += fft_elem_inc; + } } -#endif /* CONFIG_FORMAT_S32LE */ + + diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index 73ac49272ed4..d5eaf65ba091 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -8,7 +8,6 @@ #ifdef MFCC_GENERIC #include -#include #include #include #include @@ -64,161 +63,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) fft->fft_buf[i + j].real = (fft->fft_buf[i + j].real * state->window[j]) << s; } -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t s; - int16_t *x0; - int16_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Q1.15 x Q1.15 -> Q2.30 */ - s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30); - *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); - emph->delay = *x0; - } else { - *w = *x0; - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} -#endif /* CONFIG_FORMAT_S16LE */ - -#if CONFIG_FORMAT_S24LE - -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t tmp, s; - int32_t *x0; - int32_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - * S24_4LE data is in 32-bit container, shift left by 8 to Q1.31, - * then convert to Q1.15 with rounding. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Convert to Q1.31, ignore highest byte */ - s = (int32_t)((uint32_t)*x0 << 8); - /* Q1.15 x Q1.15 -> Q2.30 */ - tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30); - *w = sat_int16(Q_SHIFT_RND(tmp, 30, 15)); - emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15)); - } else { - /* Convert to Q1.31, ignore highest byte */ - s = (int32_t)((uint32_t)*x0 << 8); - *w = sat_int16(Q_SHIFT_RND(s, 31, 15)); - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} - -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE - -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int32_t s; - int32_t *x0; - int32_t *x = audio_stream_get_rptr(source); - int16_t *w = buf->w_ptr; - int copied; - int nmax; - int n1; - int n2; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - * S32 data is in 32-bit container, shift right by 16 to get 16-bit. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n1 = audio_stream_frames_without_wrap(source, x); - n2 = mfcc_buffer_samples_without_wrap(buf, w); - n = MIN(n1, n2); - n = MIN(n, nmax); - x0 = x + source_channel; - for (i = 0; i < n; i++) { - if (emph->enable) { - /* Q1.15 x Q1.15 -> Q2.30 */ - s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x0, 31, 30); - *w = sat_int16(Q_SHIFT_RND(s, 30, 15)); - emph->delay = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); - } else { - *w = sat_int16(Q_SHIFT_RND(*x0, 31, 15)); - } - x0 += num_channels; - w++; - } - - x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source)); - w = mfcc_buffer_wrap(buf, w); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = w; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_GENERIC */ diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index 80c384ad6c64..8b6a01e1f40d 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -9,7 +9,6 @@ #ifdef MFCC_HIFI3 #include -#include #include #include #include @@ -35,66 +34,6 @@ static inline void set_circular_buf0(const void *start, const void *end) * MFCC algorithm code */ -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int16 *in; - ae_int16 *x = (ae_int16 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int16) * num_channels; - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. 
- */ - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); - n = MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L16_XP(sample, in, in_inc); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - /* 2 = sizeof(ae_int16)*/ - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - - } else { - for (i = 0; i < n; i++) { - AE_L16_XP(sample, in, in_inc); - /* 2 = sizeof(ae_int16)*/ - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S16LE */ - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) { @@ -152,129 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) } } -#if CONFIG_FORMAT_S24LE -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in; - ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, 
(int16_t *)out); - n = MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - } else { - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int copied; - int nmax; - int n; - int i; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in; - ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source); - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef = emph->coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - - for (copied = 0; copied < frames; copied += n) { - nmax = frames - copied; - n = audio_stream_frames_without_wrap(source, x); - n = MIN(n, nmax); - nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out); - n = 
MIN(n, nmax); - in = x + source_channel; - if (emph->enable) { - delay = emph->delay; - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - /* S32: shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_IP(sample, out, 2); - } - emph->delay = delay; - } else { - for (i = 0; i < n; i++) { - AE_L32_XP(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_IP(sample, out, 2); - } - } - - x = audio_stream_wrap(source, x + n * num_channels); - out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out); - } - buf->s_avail += copied; - buf->s_free -= copied; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_HIFI3 */ diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 63986870793b..8cd956fcb079 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -9,7 +9,6 @@ #ifdef MFCC_HIFI4 #include -#include #include #include #include @@ -31,66 +30,10 @@ static inline void set_circular_buf0(const void *start, const void *end) AE_SETCEND0(end); } -/* Setup circular for buffer 1 */ -static inline void set_circular_buf1(const void *start, const void *end) -{ - AE_SETCBEGIN1(start); - AE_SETCEND1(end); -} - /* * MFCC algorithm code */ -#if CONFIG_FORMAT_S16LE -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int16 *in = (ae_int16 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = 
sizeof(ae_int16) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - /* Copy from source to pre-buffer for FFT. - * The pre-emphasis filter is done in this step. - */ - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L16_XC(sample, in, in_inc); - - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; i < frames; i++) { - AE_L16_XC(sample, in, in_inc); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S16LE */ - void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, int prev_data_length) { @@ -148,111 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) } } -#if CONFIG_FORMAT_S24LE -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then 
shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* Shift left by 8 to sign-extend to Q1.31 */ - sample32 = AE_SLAI32(sample32, 8); - /* Then shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S24LE */ - -#if CONFIG_FORMAT_S32LE -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel) -{ - struct audio_stream *source = bsource->data; - int num_channels = audio_stream_get_channels(source); - ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel; - ae_int16 *out = (ae_int16 *)buf->w_ptr; - ae_int32x2 sample32; - ae_int16x4 sample; - ae_int32x2 temp; - ae_int16x4 coef; - ae_int16x4 delay; - const int in_inc = sizeof(ae_int32) * num_channels; - const int out_inc = sizeof(ae_int16); - int i; - - set_circular_buf1(buf->addr, buf->end_addr); - set_circular_buf0(source->addr, source->end_addr); - - if (emph->enable) { - delay = emph->delay; - coef = emph->coef; - for (i = 0; i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - /* S32: shift right by 16 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - /* Q1.15 -> Q1.31 */ - temp = AE_CVT32X2F16_10(sample); - AE_MULAF16SS_00(temp, delay, coef); - delay = sample; - sample = AE_ROUND16X4F32SSYM(temp, temp); - AE_S16_0_XC1(sample, out, out_inc); - } - emph->delay = delay; - } else { - for (i = 0; 
i < frames; i++) { - AE_L32_XC(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 16); - sample = AE_SAT16X4(sample32, sample32); - AE_S16_0_XC1(sample, out, out_inc); - } - } - - buf->s_avail += frames; - buf->s_free -= frames; - buf->w_ptr = (int16_t *)out; -} -#endif /* CONFIG_FORMAT_S32LE */ - #endif /* MFCC_HIFI4 */ diff --git a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c new file mode 100644 index 000000000000..bb20d85e413b --- /dev/null +++ b/src/audio/mfcc/mfcc_ipc4.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_ipc4.c + * \brief IPC4-specific functions for MFCC component. + * + * Provides VAD switch control notification to user space via the + * IPC4 module notification mechanism. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief Initialize IPC notification message for VAD switch control. + * + * Allocates and configures the IPC message used to send VAD state + * change notifications to user space via a switch control. 
+ */ +int mfcc_ipc_notification_init(struct processing_module *mod) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct ipc_msg msg_proto; + struct comp_dev *dev = mod->dev; + struct comp_ipc_config *ipc_config = &dev->ipc_config; + union ipc4_notification_header *primary = + (union ipc4_notification_header *)&msg_proto.header; + struct sof_ipc4_notify_module_data *msg_module_data; + struct sof_ipc4_control_msg_payload *msg_payload; + + memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto)); + primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION; + primary->r.type = SOF_IPC4_GLB_NOTIFICATION; + primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST; + primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG; + cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension, + sizeof(struct sof_ipc4_notify_module_data) + + sizeof(struct sof_ipc4_control_msg_payload) + + sizeof(struct sof_ipc4_ctrl_value_chan)); + if (!cd->msg) { + comp_err(dev, "Failed to initialize VAD notification"); + return -ENOMEM; + } + + msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data; + msg_module_data->instance_id = IPC4_INST_ID(ipc_config->id); + msg_module_data->module_id = IPC4_MOD_ID(ipc_config->id); + msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL | + SOF_IPC4_SWITCH_CONTROL_PARAM_ID; + msg_module_data->event_data_size = sizeof(struct sof_ipc4_control_msg_payload) + + sizeof(struct sof_ipc4_ctrl_value_chan); + + msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data; + msg_payload->id = MFCC_CTRL_INDEX_VAD; + msg_payload->num_elems = 1; + msg_payload->chanv[0].channel = 0; + + comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x", + msg_module_data->instance_id, msg_module_data->module_id); + return 0; +} + +/** + * \brief Send VAD switch control notification to user space. + * \param mod Processing module. + * \param val VAD value: 1 = speech, 0 = silence. 
+ */ +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_notify_module_data *msg_module_data; + struct sof_ipc4_control_msg_payload *msg_payload; + + if (!cd->msg) + return; + + msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data; + msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data; + msg_payload->chanv[0].value = val; + ipc_msg_send(cd->msg, NULL, false); +} + +int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size) +{ + struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; + struct mfcc_comp_data *cd = module_get_private_data(mod); + struct sof_ipc4_control_msg_payload *ctl; + + comp_info(mod->dev, "entry"); + + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + ctl = (struct sof_ipc4_control_msg_payload *)fragment; + if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) { + ctl->chanv[0].value = cd->vad_prev ? 
1 : 0; + *data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]); + return 0; + } + return -EINVAL; + default: + return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); + } +} + +int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + + comp_info(mod->dev, "entry"); + + switch (config_id) { + case SOF_IPC4_SWITCH_CONTROL_PARAM_ID: + /* VAD switch is read-only, ignore set requests */ + return 0; + default: + return comp_data_blob_set(cd->model_handler, pos, data_offset_size, + fragment, fragment_size); + } +} diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 1cad4b2b984e..cc673d29b0da 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -18,6 +18,8 @@ #include #include +#include + /* Definitions for cepstral lifter */ #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23) #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23) @@ -127,6 +129,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i return -EINVAL; } + if (sample_rate > MFCC_MAX_SAMPLE_RATE) { + comp_err(dev, "Sample rate %d exceeds max %d Hz", sample_rate, MFCC_MAX_SAMPLE_RATE); + return -EINVAL; + } + if (config->sample_frequency != sample_rate) { comp_err(dev, "Config sample_frequency does not match stream"); return -EINVAL; @@ -328,15 +335,18 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Check that output data can be drained within the periods spanned by one * FFT hop. Each hop consumes fft_hop_size input samples and produces - * max_out_per_hop + 2 (magic) int16_t output values. The sink provides at - * least fft_hop_size * channels int16_t samples per hop (worst case s16). + * max_out_per_hop + header int32_t output values. 
The sink provides + * at least fft_hop_size * channels int32_t samples per hop (worst case s32). * If output exceeds this, data accumulates and will eventually overflow. + * This check is not needed in compress output mode where only actual data + * bytes are committed without zero padding. */ - int out_per_hop = max_out_per_hop + 2; + int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int32_t); int sink_per_hop = fft->fft_hop_size * channels; + bool skip_size_check = config->compress_output; - if (out_per_hop > sink_per_hop) { - comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)", + if (!skip_size_check && out_per_hop > sink_per_hop) { + comp_err(dev, "Output %d int32 per hop exceeds sink capacity %d (hop %d x ch %d)", out_per_hop, sink_per_hop, fft->fft_hop_size, channels); ret = -EINVAL; goto free_lifter; @@ -345,10 +355,24 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Set initial state for STFT */ state->waiting_fill = true; state->prev_samples_valid = false; - state->magic_pending = false; + state->header_pending = false; + state->hop_count = 0; + memset(&state->header, 0, sizeof(state->header)); + state->header.magic = MFCC_MAGIC; state->out_data_ptr = NULL; - state->out_data_ptr_32 = NULL; state->out_remain = 0; + state->vad_silence_count = 0; + state->dtx_trailing_silence = config->dtx_trailing_silence_hops; + state->dtx_silence_interval = config->dtx_silence_hops_interval; + state->dtx_silence_counter = 0; + + if (config->enable_vad) { + ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod); + if (ret < 0) { + comp_err(dev, "Failed VAD init"); + goto free_lifter; + } + } comp_dbg(dev, "done"); return 0; @@ -378,15 +402,27 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i return ret; } +static void mfcc_free_and_null(struct processing_module *mod, void **ptr) +{ + mod_free(mod, *ptr); + *ptr = NULL; +} + +/* Free MFCC 
buffers to prevent leaks on reset->prepare cycles. + * mfcc_free_buffers() NULLs the pointers after free. + */ void mfcc_free_buffers(struct processing_module *mod) { struct mfcc_comp_data *cd = module_get_private_data(mod); mod_fft_plan_free(mod, cd->state.fft.fft_plan); - mod_free(mod, cd->state.fft.fft_buf); - mod_free(mod, cd->state.fft.fft_out); - mod_free(mod, cd->state.buffers); - mod_free(mod, cd->state.melfb.data); - mod_free(mod, cd->state.dct.matrix); - mod_free(mod, cd->state.lifter.matrix); + cd->state.fft.fft_plan = NULL; + mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_buf); + mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_out); + mfcc_free_and_null(mod, (void **)&cd->state.buffers); + mfcc_free_and_null(mod, (void **)&cd->state.melfb.data); + mfcc_free_and_null(mod, (void **)&cd->state.dct.matrix); + mfcc_free_and_null(mod, (void **)&cd->state.lifter.matrix); + mfcc_free_and_null(mod, (void **)&cd->vad.noise_floor); + mfcc_free_and_null(mod, (void **)&cd->vad.weights); } diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c new file mode 100644 index 000000000000..f44a89a7dea3 --- /dev/null +++ b/src/audio/mfcc/mfcc_vad.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: BSD-3-Clause +// +// Copyright(c) 2026 Intel Corporation. +// +// Author: Seppo Ingalsuo + +/** + * \file mfcc_vad.c + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * Implements a VAD that tracks per-bin noise floor and computes a + * speech-frequency weighted energy above the floor. Speech is declared + * when the weighted delta exceeds a threshold, with hangover to prevent + * rapid toggling. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL); + +/** + * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0). 
+ * + * From IEC 61672-1:2013, source: + * https://acousticalengineer.com/a-weighting-table/ + */ +#define A_WEIGHT_TABLE_SIZE 36 + +static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = { + 6, 8, 10, 13, 16, 20, 25, 32, + 40, 50, 63, 80, 100, 125, 160, 200, + 250, 315, 400, 500, 630, 800, 1000, 1250, + 1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000, + 10000, 12500, 16000, 20000, +}; + +/** + * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps + * to INT16_MAX (32767). Original dB values converted via + * 10^(dB/20) then scaled by 32767 / max. + */ +static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = { + 2, 4, 9, 19, 43, 85, 162, 299, + 531, 862, 1382, 2140, 3129, 4370, 6172, 8136, + 10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230, + 31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856, + 21156, 17196, 13045, 9670, +}; + +/** + * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins. + * + * Weights are computed by linearly interpolating the A-weighting table + * at each Mel bin center frequency. Output weights are in Q1.15 and + * sum to approximately 2^15. + * + * \param[out] weights Output weight array. + * \param[in] num_mel Number of Mel bins. + * \param[in] sample_rate Sample rate in Hz. 
+ */ +static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int32_t sample_rate) +{ + int32_t scaled, num; + int32_t sum = 0; + int16_t f_hz, f0, f1, w, w0, w1, den; + int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2)); /* Nyquist (max 32767 Hz) in Mel */ + int16_t mel_step = mel_end / (num_mel + 1); + int i, j; + + if (!num_mel) + return; + + for (i = 0; i < num_mel; i++) { + f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step)); + + /* Find the table interval containing f_hz and interpolate */ + if (f_hz <= a_weight_hz[0]) { + w = a_weight_lin[0]; + } else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) { + w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1]; + } else { + /* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */ + for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) { + if (f_hz < a_weight_hz[j + 1]) + break; + } + + /* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */ + f0 = a_weight_hz[j]; + f1 = a_weight_hz[j + 1]; + w0 = a_weight_lin[j]; + w1 = a_weight_lin[j + 1]; + num = (int32_t)(w1 - w0) * (f_hz - f0); + den = f1 - f0; + w = w0 + (int16_t)(num / den); + } + + weights[i] = w; + sum += w; + } + + /* Normalize weights so they sum to 1.0 */ + for (i = 0; i < num_mel; i++) { + scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */ + weights[i] = (int16_t)Q_SHIFT_RND(scaled, 16, 15); /* Round to Q1.15 */ + } +} + +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate, + struct processing_module *mod) +{ + if (!vad) + return -EINVAL; + + if (num_mel_bins <= 0) + return -EINVAL; + + vad->num_mel_bins = num_mel_bins; + vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD; + vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA; + vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST; + vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES; + vad->hangover_counter = 0; + vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES; + vad->frame_count = 0; + vad->is_speech = false; + vad->initialized = false; + 
+ /* Allocate per-bin noise floor */ + vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t)); + if (!vad->noise_floor) + return -ENOMEM; + + /* Allocate and compute per-bin weights */ + vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t)); + if (!vad->weights) { + mod_free(mod, vad->noise_floor); + vad->noise_floor = NULL; + return -ENOMEM; + } + + mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate); + return 0; +} + +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log) +{ + int64_t signal_energy = 0; + int64_t noise_energy = 0; + int64_t energy_delta = 0; + int32_t delta; + int32_t p; + int16_t alpha; + int i; + + if (!vad || !mel_log) + return 0; + + /* Stop incrementing after init phase to avoid wrap-around restarting fast alpha. + * Select rise alpha based on convergence phase. + */ + if (vad->frame_count < vad->init_frames) { + vad->frame_count++; + alpha = vad->noise_rise_alpha_fast; + } else { + alpha = vad->noise_rise_alpha_slow; + } + + /* Initialize noise floor to first frame */ + if (!vad->initialized) { + for (i = 0; i < vad->num_mel_bins; i++) + vad->noise_floor[i] = mel_log[i]; + + vad->initialized = true; + } + + /* Update noise floor: follow down instantly, rise slowly */ + for (i = 0; i < vad->num_mel_bins; i++) { + if (mel_log[i] < vad->noise_floor[i]) { + /* Instant follow-down */ + vad->noise_floor[i] = mel_log[i]; + } else { + /* Slow rise: floor += alpha * (mel - floor) + * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result + * alpha is Q1.15, delta is Q9.23 + */ + delta = mel_log[i] - vad->noise_floor[i]; + p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23); + vad->noise_floor[i] += p; + } + } + + /* Compute weighted signal energy and noise floor energy. 
+ * weights are Q1.15, mel values are Q9.23 + * Products are Q10.38, accumulate in int64_t then shift to Q9.23 + */ + + for (i = 0; i < vad->num_mel_bins; i++) { + signal_energy += (int64_t)vad->weights[i] * mel_log[i]; + noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i]; + } + + vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23)); + vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23)); + energy_delta = vad->energy - vad->noise_energy; + + /* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */ + if (energy_delta > vad->energy_threshold) { + vad->hangover_counter = vad->hangover_max; + vad->is_speech = true; + } else { + if (vad->hangover_counter > 0) { + vad->hangover_counter--; + vad->is_speech = true; + } else { + vad->is_speech = false; + } + } + + return vad->is_speech ? 1 : 0; +} diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md new file mode 100644 index 000000000000..f825afb758a6 --- /dev/null +++ b/src/audio/mfcc/tune/README.md @@ -0,0 +1,189 @@ +# SOF MFCC Tuning Tools + +This directory contains a tool to create configuration blob for SOF +MFCC component. It's simply run in Matlab or Octave with command +`setup_mfcc`. The MFCC configuration parameters can be edited from the +script. + +## Testbench + +The configuration can be test run with testbench. First the test topologies +need to be created with `scripts/build-tools.sh -t`. Next the testbench +is built with `scripts/rebuild-testbench.sh`. + +Once the previous steps are done, a sample wav file can be processed +with script `run_mfcc.sh`. The script converts the input to raw 16 kHz +stereo format and runs the testbench for S16, S24, and S32 bit depths, +producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. 
+ +``` +./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav +``` + +Output files from host testbench: + +| File | Content | +|------|---------| +| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients | +| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram | + +If the `XTENSA_PATH` environment variable is set, the script also runs +the Xtensa build of the testbench (via `xt-run`) and produces additional +output files prefixed with `xt_`: + +| File | Content | +|------|---------| +| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients | +| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram | + +## Decoding and Plotting + +All output files can be decoded and plotted at once in Matlab or Octave +with the `decode_all.m` script: + +```matlab +decode_all +``` + +This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and +`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all +files that exist including the Xtensa variants. + +Individual files can also be decoded manually: + +```matlab +[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); +``` + +In the above it's known from configuration script that MFCC was set up to +output 13 cepstral coefficients from each FFT → Mel → DCT → Cepstral +coefficients computation run. + +The 80 bands Mel output can be visualized with command: + +```matlab +[mel, t, n] = decode_mel('mel_s16.raw', 80); +``` + +## Live Whisper Transcription with DSP VAD + +The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`. +It can be used with development topologies +`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and +`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. It captures Mel audio features +and VAD flags from the default audio device `hw:0,47` (headset microphone). +The captured frames with detected speech are sent to the Whisper speech +recognizer model for conversion to text. + +### Prerequisites + +The script needs OpenVINO.
Please follow the install procedure from +. + +The following Python packages must be installed with pip into the same OpenVINO venv: + +```bash +pip install openvino openvino-tokenizers openvino-genai +pip install optimum[intel] +pip install transformers +pip install huggingface_hub +``` + +### NPU / GPU Support + +By default the script runs the Whisper encoder model on the NPU. To +use the NPU, install the driver from +. If the NPU is not +available, change the encoder to CPU with run option `--encoder-device CPU`. +With a GPU both `--encoder-device GPU` and `--decoder-device GPU` can be set. + +### Example run + +Check which capture devices are available. + +```bash +arecord -l +``` + +In this example the devices hw:0,47 and hw:0,48 support the audio +features stream. + +```bash +**** List of CAPTURE Hardware Devices **** +card 0: sofsoundwire [sof-soundwire], device 1: Jack In (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 4: Microphone (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 47: Jack In Audio Features (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +card 0: sofsoundwire [sof-soundwire], device 48: Microphone Audio Features (*) [] + Subdevices: 1/1 + Subdevice #0: subdevice #0 +``` + +With the Whisper model running on the CPU and the laptop's internal +microphones as the capture device, the run command is: + +```bash +python3 sof_mel_to_text_live_dsp_vad.py --encoder-device CPU --device hw:0,48 +``` + +The output of the script run is shown below: + +```bash +=== Live SOF Mel → Whisper Transcription (DSP VAD) === + +Starting capture: arecord -D hw:0,48 -f S32_LE -c 2 -r 16000 -t raw --buffer-size 8192 +VAD source: DSP (embedded in stream) +Silence trigger: 100ms (10 frames) +Whisper model: whisper-medium-int4-ov (encoder: CPU, decoder: CPU) + + [ 0.01s] SILENCE + [ 1.39s] SPEECH + [ 2.57s] SILENCE + [ 2.66s] Transcribing 118 frames (1.2s)... 
+ [Whisper] encoder: 1.30s + [Whisper] decoder: 0.59s (3 tokens) + + >> "Hello computer" +``` + +## Live Spectrogram Viewers + +### Mel Spectrogram + +The `sof_mel_spectrogram_compress.py` script captures Mel spectrogram +frames from a SOF compress PCM device and displays them as a live +scrolling spectrogram with VAD status. This is a lightweight viewer +that does not run Whisper inference. + +```bash +python3 sof_mel_spectrogram_compress.py --card 0 --device 48 --width 300 +``` + +### Cepstral Spectrogram + +The `sof_ceps_spectrogram_compress.py` script is similar but displays +cepstral coefficients (MFCC) instead of Mel bands. + +```bash +python3 sof_ceps_spectrogram_compress.py --card 0 --device 48 --num-ceps 13 --width 300 +``` + +## Live Whisper Transcription with Compress PCM + +The `sof_mel_to_text_live_compress.py` script captures Mel spectrogram +frames from a SOF compress PCM device and performs live Whisper +transcription using OpenVINO. Unlike `sof_mel_to_text_live_dsp_vad.py` +which uses `arecord`, this script reads directly from the compress PCM +device with DTX-aware frame handling. + +```bash +python3 sof_mel_to_text_live_compress.py --card 0 --device 48 --model whisper-medium-int4-ov +``` + +The same OpenVINO prerequisites and pip packages apply as described above +for `sof_mel_to_text_live_dsp_vad.py`. diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt deleted file mode 100644 index a0c3189e81a3..000000000000 --- a/src/audio/mfcc/tune/README.txt +++ /dev/null @@ -1,52 +0,0 @@ -This directory contains a tool to create configuration blob for SOF -MFCC component. It's simply run in Matlab or Octave with command -"setup_mfcc". The MFCC configuration parameters can be edited from the -script. - -The configuration can be test run with testbench. First the test topologies -need to be created with "scripts/build-tools.sh -t". Next the testbench -is build with "scripts/rebuild-testbench.sh". 
- -Once the previous steps are done, a sample wav file can be processed -with script run_mfcc.sh. The script converts the input to raw 16 kHz -stereo format and runs the testbench for S16, S24, and S32 bit depths, -producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. - -./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav - -Output files from host testbench: - mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw - cepstral coefficients - mel_s16.raw, mel_s24.raw, mel_s32.raw - Mel spectrogram - -If the XTENSA_PATH environment variable is set, the script also runs -the Xtensa build of the testbench (via xt-run) and produces additional -output files prefixed with "xt_": - xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw - xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw - -All output files can be decoded and plotted at once in Matlab or Octave -with the decode_all.m script: - -decode_all - -This calls decode_ceps for each MFCC file (13 cepstral coefficients) and -decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all -files that exist including the Xtensa variants. - -Individual files can also be decoded manually: - -[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); - -In the above it's known from configuration script that MFCC was set up to -output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral -coefficients computation run. - -The 80 bands Mel output can be visualized with command: - -[mel, t, n] = decode_mel('mel_s16.raw', 80); - -Other kind of signals have quite big visual difference in audio features. Try -e.g. other sound files found in computer. 
- -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m index d5b60289b4cf..4c377bf5029a 100644 --- a/src/audio/mfcc/tune/decode_all.m +++ b/src/audio/mfcc/tune/decode_all.m @@ -6,12 +6,11 @@ num_ceps = 13; num_mel = 80; -% MFCC cepstral output files +% MFCC cepstral output files (all int32 output, Q9.23) ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'}; -% Mel output files with corresponding format +% Mel output files (all int32 output, Q9.23) mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'}; -mel_fmts = {'s16', 's24', 's32'}; % Xtensa prefixed variants xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'}; @@ -19,21 +18,21 @@ all_ceps_files = [ceps_files, xt_ceps_files]; all_mel_files = [mel_files, xt_mel_files]; -all_mel_fmts = [mel_fmts, mel_fmts]; for i = 1:length(all_ceps_files) fn = all_ceps_files{i}; if exist(fn, 'file') fprintf('Decoding MFCC ceps: %s\n', fn); - [ceps, t, n] = decode_ceps(fn, num_ceps); + [ceps, t, n, vad, energy, noise_energy, frame_num] = ... + decode_ceps(fn, num_ceps); end end for i = 1:length(all_mel_files) fn = all_mel_files{i}; - fmt = all_mel_fmts{i}; if exist(fn, 'file') fprintf('Decoding Mel: %s\n', fn); - [mel, t, n] = decode_mel(fn, num_mel, fmt); + [mel, t, n, vad, energy, noise_energy, frame_num] = ... 
+ decode_mel(fn, num_mel); end end diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m index a63677fa3731..480eadea2945 100644 --- a/src/audio/mfcc/tune/decode_ceps.m +++ b/src/audio/mfcc/tune/decode_ceps.m @@ -1,57 +1,112 @@ -% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels) +% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, hop, num_channels) % % Input % fn - File with MFCC data in .raw or .wav format % num_ceps - number of cepstral coefficients per frame -% num_channels - needed for .raw format, omit for .wav +% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms +% num_channels - needed for .raw format, omit for .wav, default 1 % % Outputs % ceps - cepstral coefficients % t - time vector for plotting % n - ceps 1..num_ceps vector for plotting +% vad - VAD flag per frame from DSP +% energy - weighted signal energy per frame from DSP +% noise_energy - weighted noise floor energy per frame from DSP +% frame_number - frame number from DSP % SPDX-License-Identifier: BSD-3-Clause -% Copyright(c) 2022 Intel Corporation. All rights reserved. +% Copyright(c) 2022-2026 Intel Corporation. All rights reserved. -function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels) +function [ceps, t, n, vad, energy, noise_energy, frame_number] = ... 
+ decode_ceps(fn, num_ceps, hop, num_channels) if nargin < 3 + hop = 10e-3; +end +if nargin < 4 num_channels = 1; end % MFCC stream -fs = 16e3; -qformat = 7; -magic = [25443 28006]; % ASCII 'mfcc' as int16 +qformat = 23; % Q9.23 in int32 +magic = int32(1835426659); % 0x6D666363 as int32 +num_magic = 1; % magic word is 1 x int32 -% Load output data +% Load output data (always int32) [data, num_channels] = get_file(fn, num_channels); -idx1 = find(data == magic(1)); -idx = []; -for i = 1:length(idx1) - if data(idx1(i) + 1) == magic(2) - idx = [idx idx1(i)]; - end -end +idx = find(data == magic); if isempty(idx) error('No magic value markers found from stream'); end -period_ceps = idx(2)-idx(1); num_frames = length(idx); -t_ceps = period_ceps / num_channels / fs; -t = (0:num_frames -1) * t_ceps; -n = 1:num_ceps; -ceps = zeros(num_ceps, num_frames); +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32, followed by num_ceps coefficients (int32). +payload_len = 5 + num_ceps; + +% Last frame can be incomplete due to span over multiple periods +last = idx(end) + num_magic + payload_len - 1; +if (last > length(data)) + num_frames = num_frames - 1; +end + +payload = zeros(payload_len, num_frames); for i = 1:num_frames - i1 = idx(i) + 2; - i2 = i1 + num_ceps - 1; - ceps(:,i) = data(i1:i2) / 2^qformat; + i1 = idx(i) + num_magic; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end +frame_number = payload(1, :); +% payload(2,:) is reserved, skip +energy = payload(3, :) / 2^23; +noise_energy = payload(4, :) / 2^23; +vad = payload(5, :); +ceps = payload(6:payload_len, :) / 2^qformat; + +% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline. +% Missing frames are filled with the minimum ceps value found in the data. 
+first_frame = frame_number(1); +last_frame = frame_number(end); +total_frames = last_frame - first_frame + 1; +if total_frames > num_frames + ceps_fill = min(ceps(:)); + ceps_full = ones(num_ceps, total_frames) * ceps_fill; + vad_full = zeros(1, total_frames); + energy_full = zeros(1, total_frames); + noise_energy_full = zeros(1, total_frames); + frame_number_full = first_frame:last_frame; + has_data = false(1, total_frames); + for i = 1:num_frames + fi = frame_number(i) - first_frame + 1; + ceps_full(:, fi) = ceps(:, i); + vad_full(fi) = vad(i); + energy_full(fi) = energy(i); + noise_energy_full(fi) = noise_energy(i); + has_data(fi) = true; + end + % Forward-fill gaps with last received values + for fi = 2:total_frames + if ~has_data(fi) + ceps_full(:, fi) = ceps_full(:, fi - 1); + energy_full(fi) = energy_full(fi - 1); + noise_energy_full(fi) = noise_energy_full(fi - 1); + end + end + ceps = ceps_full; + vad = vad_full; + energy = energy_full; + noise_energy = noise_energy_full; + frame_number = frame_number_full; +end + +t = (frame_number - first_frame) * hop; +n = 1:num_ceps; + figure; surf(t, n, ceps, 'EdgeColor', 'none'); colormap(jet); @@ -70,18 +125,18 @@ switch lower(ext) case '.raw' fh = fopen(fn, 'r'); - data = fread(fh, 'int16'); + data = fread(fh, 'int32'); fclose(fh); case '.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - if ~strcmp(t.class, 'int16'); - error('Only 16-bit wav file format is supported'); + if ~strcmp(t.class, 'int32') + error('Expected 32-bit wav for int32 MFCC output format'); end s = size(tmp); num_channels = s(2); if num_channels > 1 - data = int16(zeros(prod(s), 1)); + data = int32(zeros(prod(s), 1)); for i = 1:num_channels data(i:num_channels:end) = tmp(:, i); end diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m index f6a723aa2040..0aca1e35ec8d 100644 --- a/src/audio/mfcc/tune/decode_mel.m +++ b/src/audio/mfcc/tune/decode_mel.m @@ -1,23 +1,28 @@ -% [mel, t, n] = decode_mel(fn, num_mel, fmt, 
num_channels) +% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, hop, num_channels) % % Input % fn - File with Mel data in .raw or .wav format +% hop - STFT hop in seconds, defaults to 10e-3 for 10 ms % num_mel - number of Mel coefficients per frame -% fmt - format of the Mel data ('s16', 's24', 's32') -% num_channels - needed for .raw format, omit for .wav +% num_channels - needed for .raw format, omit for .wav, default 1 % % Outputs % mel - Mel coefficients % t - time vector for plotting % n - mel 1..num_mel vector for plotting +% vad - VAD flag per frame from DSP +% energy - weighted signal energy per frame from DSP +% noise_energy - weighted noise floor energy per frame from DSP +% frame_number - frame number from DSP % SPDX-License-Identifier: BSD-3-Clause % Copyright(c) 2026 Intel Corporation. -function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) +function [mel, t, n, vad, energy, noise_energy, frame_number] = ... + decode_mel(fn, num_mel, hop, num_channels) if nargin < 3 - fmt = 's16'; + hop = 10e-3; end if nargin < 4 num_channels = 1; @@ -25,42 +30,21 @@ % MFCC stream fs = 16e3; +qformat = 23; % Q9.23 in int32 -switch fmt - case 's16' - qformat = 7; - magic = [25443 28006]; % ASCII 'mfcc' as two int16 - num_magic = 2; - case 's24' - qformat = 15; - magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; - case 's32' - qformat = 23; - magic = int32(1835426659); % 0x6D666363 as int32 - num_magic = 1; - otherwise - error("Use 's16', 's24', or 's32' as format."); -end +magic = int32(1835426659); % 0x6D666363 as int32 +num_magic = 1; % magic word is 1 x int32 +num_other_header = 5; % frame_number, reserved, energy, noise, vad (all int32) -% Load output data -[data, num_channels] = get_file(fn, num_channels, fmt); - -if strcmp(fmt, 's16') - idx1 = find(data == magic(1)); - idx = []; - for i = 1:length(idx1) - next_word = idx1(i) + 1; - if next_word <= length(data) - if data(next_word) == magic(2) - idx = [idx 
idx1(i)]; - end - end - end -else - idx = find(data == magic); +% Load output data (always int32) +[data, num_channels] = get_file(fn, num_channels); + +if isempty(data) + error('File %s is empty', fn); end +idx = find(data == magic); + if isempty(idx) error('No magic value markers found from stream'); end @@ -68,65 +52,112 @@ period_mel = idx(2)-idx(1); num_frames = length(idx); +% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag] +% as int32, followed by num_mel coefficients. +% For s16 each int32 occupies 2 int16 slots. +payload_len = num_other_header + num_mel; + % Last frame can be incomplete due to span over multiple periods -last = idx(end) + num_mel - 1; +last = idx(end) + num_magic + payload_len - 1; if (last > length(data)) num_frames = num_frames - 1; end -t_mel = period_mel / num_channels / fs; -t = (0:num_frames -1) * t_mel; -n = 1:num_mel; - -mel = zeros(num_mel, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames i1 = idx(i) + num_magic; - i2 = i1 + num_mel - 1; - mel(:,i) = double(data(i1:i2)) / 2^qformat; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end -figure; +frame_number = payload(1, :); +% payload(2,:) is reserved, skip +energy = payload(3, :) / 2^23; +noise_energy = payload(4, :) / 2^23; +vad = payload(5, :); +mel = payload(6:payload_len, :) / 2^qformat; + +% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline. +% Missing frames are filled with the minimum Mel value found in the data. 
+first_frame = frame_number(1); +last_frame = frame_number(end); +total_frames = last_frame - first_frame + 1; +if total_frames > num_frames + mel_fill = min(mel(:)); + mel_full = ones(num_mel, total_frames) * mel_fill; + vad_full = zeros(1, total_frames); + energy_full = zeros(1, total_frames); + noise_energy_full = zeros(1, total_frames); + frame_number_full = first_frame:last_frame; + has_data = false(1, total_frames); + for i = 1:num_frames + fi = frame_number(i) - first_frame + 1; + mel_full(:, fi) = mel(:, i); + vad_full(fi) = vad(i); + energy_full(fi) = energy(i); + noise_energy_full(fi) = noise_energy(i); + has_data(fi) = true; + end + % Forward-fill gaps with last received values + for fi = 2:total_frames + if ~has_data(fi) + mel_full(:, fi) = mel_full(:, fi - 1); + energy_full(fi) = energy_full(fi - 1); + noise_energy_full(fi) = noise_energy_full(fi - 1); + end + end + mel = mel_full; + vad = vad_full; + energy = energy_full; + noise_energy = noise_energy_full; + frame_number = frame_number_full; +end + +t = (frame_number - first_frame) * hop; +n = 1:num_mel; + +figure imagesc(t, n, mel); axis xy; colormap(jet); colorbar; tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn); title(tstr, 'Interpreter', 'None'); -xlabel('Time (s)'); ylabel('Mel coef #'); +figure +subplot(2,1,1); +plot(t, vad) +ax = axis(); +axis([ax(1:2) -0.1 1.1]); +grid on; +title(tstr, 'Interpreter', 'None'); +xlabel('Time (s)'); +ylabel('VAD flag'); + +subplot(2,1,2); +plot(t, energy, t, noise_energy); +grid on; +legend('Energy', 'Noise Energy'); +xlabel('Time (s)'); +ylabel('Energy'); + end -function [data, num_channels] = get_file(fn, num_channels, fmt) +function [data, num_channels] = get_file(fn, num_channels) [~, ~, ext] = fileparts(fn); -switch fmt - case 's16' - read_fmt = 'int16'; - case {'s24', 's32'} - read_fmt = 'int32'; - otherwise - error("Use 's16', 's24', or 's32' as format."); -end - switch lower(ext) case '.raw' fh = fopen(fn, 'r'); - data = fread(fh, read_fmt); + data 
= fread(fh, 'int32'); fclose(fh); case '.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - switch fmt - case 's16' - if ~strcmp(t.class, 'int16') - error('Expected 16-bit wav for s16 format'); - end - case {'s24', 's32'} - if ~strcmp(t.class, 'int32') - error('Expected 32-bit wav for %s format', fmt); - end + if ~strcmp(t.class, 'int32') + error('Expected 32-bit wav for int32 MFCC output format'); end s = size(tmp); num_channels = s(2); diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m index bd2b3f11e60b..dbf69587a74f 100644 --- a/src/audio/mfcc/tune/setup_mfcc.m +++ b/src/audio/mfcc/tune/setup_mfcc.m @@ -25,6 +25,32 @@ function setup_mfcc() setup.tplg_fn = 'mel80.conf'; export_mfcc_setup(gen_cfg, setup); + % Blob for mel spectrogram with compress PCM output + setup = get_mel_spectrogram_config(); + setup.compress_output = true; + setup.tplg_fn = 'mel80_compress.conf'; + export_mfcc_setup(gen_cfg, setup); + + % Blob for mel spectrogram with compress PCM output and DTX + setup = get_mel_spectrogram_config(); + setup.compress_output = true; + setup.enable_dtx = true; + setup.dtx_trailing_silence_hops = 20; + setup.dtx_silence_hops_interval = 500; + setup.tplg_fn = 'mel80_compress_dtx.conf'; + export_mfcc_setup(gen_cfg, setup); + + % Default MFCC (cepstral) with compress PCM output + setup = get_mfcc_default_config(); + setup.compress_output = true; + setup.enable_vad = true; + setup.enable_dtx = true; + setup.dtx_trailing_silence_hops = 20; + setup.dtx_silence_hops_interval = 500; + setup.update_controls = true; + setup.tplg_fn = 'ceps13_compress_dtx.conf'; + export_mfcc_setup(gen_cfg, setup); + end function cfg = get_mfcc_default_config() @@ -62,6 +88,12 @@ function setup_mfcc() cfg.mmax_init = 0; % same cfg.mmax_coef = 0; % same cfg.dynamic_mmax = false; % same + cfg.enable_vad = false; + cfg.enable_dtx = false; + cfg.dtx_trailing_silence_hops = 0; + cfg.dtx_silence_hops_interval = 0; + cfg.update_controls = false; + 
cfg.compress_output = false; end function cfg = get_mel_spectrogram_config() @@ -99,6 +131,12 @@ function setup_mfcc() cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max) cfg.dynamic_mmax = true; + cfg.enable_vad = true; + cfg.enable_dtx = false; + cfg.dtx_trailing_silence_hops = 0; + cfg.dtx_silence_hops_interval = 0; + cfg.update_controls = true; + cfg.compress_output = false; end function export_mfcc_setup(gen_cfg, cfg) @@ -107,7 +145,7 @@ function export_mfcc_setup(gen_cfg, cfg) addpath([gen_cfg.tools_path 'tune/common']); %% Blob size, size plus reserved(8) + current parameters -nbytes_data = 104; +nbytes_data = 116; %% Little endian sh32 = [0 -8 -16 -24]; @@ -133,8 +171,10 @@ function export_mfcc_setup(gen_cfg, cfg) v = q_convert(cfg.mmax_init, 7); [b8, j] = add_w16b(v, b8, j); v = q_convert(cfg.mmax_coef, 15); [b8, j] = add_w16b(v, b8, j); +v = cfg.dtx_trailing_silence_hops; [b8, j] = add_w16b(v, b8, j); % DTX trailing silence hops +v = cfg.dtx_silence_hops_interval; [b8, j] = add_w16b(v, b8, j); % DTX silence frame interval % Reserved -for i = 1:6 +for i = 1:5 [b8, j] = add_w32b(0, b8, j); end @@ -160,6 +200,10 @@ function export_mfcc_setup(gen_cfg, cfg) v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD v = 0; [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD +% reserved16[3] +for i = 1:3 + [b8, j] = add_w16b(0, b8, j); +end v = cfg.htk_compat; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.raw_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.remove_dc_offset; [b8, j] = add_w8b(v, b8, j); % bool @@ -168,6 +212,14 @@ function export_mfcc_setup(gen_cfg, cfg) v = cfg.subtract_mean; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.use_energy; [b8, j] = add_w8b(v, b8, j); % bool v = cfg.dynamic_mmax; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.enable_vad; [b8, j] = add_w8b(v, b8, j); % bool 
+v = cfg.enable_dtx; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.update_controls; [b8, j] = add_w8b(v, b8, j); % bool +v = cfg.compress_output; [b8, j] = add_w8b(v, b8, j); % bool +% reserved_bool[4] +for i = 1:4 + [b8, j] = add_w8b(0, b8, j); +end %% Export tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn]; diff --git a/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py new file mode 100644 index 000000000000..3a61641c0812 --- /dev/null +++ b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py @@ -0,0 +1,234 @@ +"""Live scrolling cepstral coefficient viewer for SOF compress PCM capture. + +Displays a real-time scrolling MFCC (cepstral coefficient) plot and VAD +strip from ALSA compress PCM capture (crecord) with embedded DSP VAD flag. + +Frame format: [magic(int32), frame_number(uint32), reserved(int32), + energy(int32), noise_energy(int32), vad_flag(int32), + ceps[0..N-1](int32)] + +Cepstral coefficients are in Q9.23 fixed-point format. 
+ +Usage: + python sof_ceps_spectrogram_compress.py [--card 0] [--device 48] + python sof_ceps_spectrogram_compress.py --num-ceps 13 --width 300 +""" + +import argparse +import os +import queue +import struct +import subprocess +import threading +import numpy as np +import matplotlib +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt + +# SOF compress frame format constants (with DSP data header) +SOF_MAGIC_BYTES = struct.pack(' 3: + del buf[:-3] + return None, None, None + end = idx + frame_bytes + if end > len(buf): + del buf[:idx] + return None, None, None + + frame_number = struct.unpack_from(' 3: + del buf[:-3] + return None, None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None, None + + frame_number = struct.unpack_from(' 3: + del buf[:-3] + return None, None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None, None + + # Parse header fields + frame_number = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + def flush_speech(t_now): + """Flush speech buffer to Whisper.""" + nonlocal speech_buffer, silence_time, pending_queue, pending_t + if not speech_buffer: + silence_time = None + return + if not try_transcribe(transcriber, speech_buffer, t_now, + on_transcription): + pending_queue = list(speech_buffer) + pending_t = t_now + speech_buffer.clear() + silence_time = None + + try: + while True: + # Calculate queue timeout based on patience timer + get_timeout = 0.1 # default polling interval + if silence_time is not None: + remaining = patience - (time.monotonic() - silence_time) + get_timeout = max(remaining, 0.01) + + try: + item = frame_q.get(timeout=get_timeout) + except queue.Empty: + # Patience expired — flush speech to Whisper + if silence_time is not None: + elapsed = time.monotonic() - silence_time + if elapsed >= patience: + t = last_hop * SOF_HOP_SEC + flush_speech(t) + + # Drain pending queue when 
Whisper becomes free + if pending_queue is not None and not transcriber.is_busy(): + print(f" [{pending_t:7.2f}s] Whisper free, sending " + f"{len(pending_queue)} queued frames", flush=True) + transcriber.transcribe_async(pending_queue, on_transcription) + pending_queue = None + continue + + if item is None: + # Reader thread ended (crecord exited) + stderr_out = proc.stderr.read().decode(errors='replace') + rc = proc.wait() + print(f"\ncrecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + + frame_number, vad_flag, frame_ints = item + recv_frames += 1 + last_hop = frame_number + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + t = frame_number * SOF_HOP_SEC + + # Print VAD transitions + if speech != prev_speech: + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag} (hop {frame_number}, " + f"received {recv_frames})", flush=True) + prev_speech = speech + + # Drain pending queue when Whisper becomes free + if pending_queue is not None and not transcriber.is_busy(): + print(f" [{pending_t:7.2f}s] Whisper free, sending " + f"{len(pending_queue)} queued frames", flush=True) + transcriber.transcribe_async(pending_queue, on_transcription) + pending_queue = None + + # --- Speech buffering logic --- + if speech: + if len(speech_buffer) >= MAX_SPEECH_FRAMES: + n = len(speech_buffer) + duration = n * SOF_HOP_SEC + print(f" [{t:7.2f}s] Buffer full ({duration:.1f}s), " + f"forcing transcription", flush=True) + flush_speech(t) + + speech_buffer.append(mel.copy()) + silence_time = None # speech resumed, cancel patience timer + + else: + # VAD=0: start patience timer if we have buffered speech. + # Don't refresh if already running so trailing silence + # frames don't extend the wait. 
+ if speech_buffer and silence_time is None: + silence_time = time.monotonic() + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + # Flush remaining speech + if speech_buffer: + t = last_hop * SOF_HOP_SEC + flush_speech(t) + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"\n\nCapture stopped. Received {recv_frames} frames.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture from compress PCM with DTX-aware " + "Whisper transcription") + parser.add_argument('--card', '-c', type=int, default=0, + help='ALSA card number (default: 0)') + parser.add_argument('--device', '-d', type=int, default=54, + help='ALSA compress device number (default: 54)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + parser.add_argument('--patience', type=float, default=SILENCE_PATIENCE_S, + help=f'Seconds of silence patience before triggering ' + f'transcription (default: {SILENCE_PATIENCE_S})') + args = parser.parse_args() + + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (Compress PCM, DTX) ===\n") + run_capture(args.card, args.device, args.model, args.encoder_device, + args.decoder_device, patience=args.patience) + + +if __name__ == '__main__': + main() diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py new file mode 100644 index 000000000000..9171df2e3cec --- /dev/null +++ 
b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py @@ -0,0 +1,384 @@ +"""Live SOF mel capture with DSP VAD-triggered Whisper transcription. + +Captures mel frames from ALSA with embedded VAD flag from the DSP. +Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)] +When silence of 100ms is detected after speech, sends the buffered mel +features to Whisper (OpenVINO encoder+decoder) for transcription. +Capture continues running during Whisper inference. + +Usage: + python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov] +""" + +import argparse +import os +import struct +import subprocess +import threading +import time +import numpy as np +import openvino as ov +import huggingface_hub as hf_hub +from pathlib import Path + +# SOF mel_s32.raw format constants (with DSP data header) +SOF_MAGIC_BYTES = struct.pack(' 3: + del buf[:-3] + return None, None + end = idx + SOF_FRAME_BYTES + if end > len(buf): + del buf[:idx] + return None, None + # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy) + vad_flag = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + try: + while True: + data = proc.stdout.read(read_chunk) + if not data: + rc = proc.poll() + if rc is not None: + stderr_out = proc.stderr.read().decode(errors='replace') + print(f"\narecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + continue + + buf.extend(data) + + while True: + vad_flag, frame_ints = find_frame_in_buffer(buf) + if frame_ints is None: + break + + frame_num += 1 + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + + # Print VAD transitions + if speech != prev_speech: + t = frame_num * 0.01 + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag}", flush=True) + prev_speech = speech + + # --- Speech buffering logic --- + 
if speech: + if len(speech_buffer) >= MAX_SPEECH_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + print(f" [{t:7.2f}s] Buffer full ({duration:.1f}s), " + f"forcing transcription of {n} frames", + flush=True) + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + speech_buffer.clear() + speech_buffer.append(mel.copy()) + silence_counter = 0 + was_speaking = True + else: + if was_speaking: + silence_counter += 1 + if silence_counter >= SILENCE_TRIGGER_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + + if n < MIN_SPEECH_FRAMES: + # Too short — discard + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + continue + + # Silence threshold reached — send to Whisper + print(f" [{t:7.2f}s] Transcribing {n} frames " + f"({duration:.1f}s)...", flush=True) + + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print("\n\nCapture stopped.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture with DSP VAD-triggered Whisper transcription") + parser.add_argument('--device', '-D', default='hw:0,47', + help='ALSA capture device (default: hw:0,47)') + parser.add_argument('--rate', '-r', type=int, default=16000, + help='Sample rate for arecord (default: 16000)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO 
model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + args = parser.parse_args() + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n") + run_capture(args.device, args.rate, args.model, args.encoder_device, + args.decoder_device) + + +if __name__ == '__main__': + main() diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 025eef116752..b380cd84fdf0 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -9,9 +9,12 @@ #define __SOF_AUDIO_MFCC_MFCC_COMP_H__ #include +#include #include #include #include +#include +#include #include #include @@ -31,17 +34,24 @@ #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */ #define MFCC_FFT_BITS 32 +#define MFCC_MAX_SAMPLE_RATE 64000 /* Max sample rate in Hz, limited by int16_t Mel scale */ -/** \brief Type definition for processing function select return value. */ -typedef void (*mfcc_func)(struct processing_module *mod, - struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, - int frames); +/** \brief Switch control index for VAD notification to user space */ +#define MFCC_CTRL_INDEX_VAD 0 -/** \brief MFCC processing functions map item. */ -struct mfcc_func_map { - uint8_t source; /**< source frame format */ - mfcc_func func; /**< processing function */ +/** + * \brief Data header prepended to every MFCC output frame. + * + * Written before the Mel spectrum or cepstral coefficient data in each + * output frame. 
+ */ +struct mfcc_data_header { + uint32_t magic; /**< Magic word MFCC_MAGIC (0x6d666363) */ + uint32_t frame_number; /**< Frame number, counting calculated frames starting from 0 */ + int32_t reserved; /**< Reserved for future use, set to 0 */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + int32_t vad_flag; /**< VAD decision: 1 = speech, 0 = silence */ }; struct mfcc_buffer { @@ -60,6 +70,10 @@ struct mfcc_pre_emph { int enable; }; +/** \brief Type definition for source/sink based input copy function. */ +typedef void (*mfcc_source_func)(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); + struct mfcc_fft { struct icomplex32 *fft_buf; /**< fft_padded_size */ struct icomplex32 *fft_out; /**< fft_padded_size */ @@ -105,20 +119,29 @@ struct mfcc_state { bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */ bool waiting_fill; /**< booleans */ bool prev_samples_valid; - bool magic_pending; /**< True when magic word not yet written for current output */ + bool header_pending; /**< True when data header not yet written for current output */ + struct mfcc_data_header header; /**< Data header for current output frame */ size_t sample_buffers_size; /**< bytes */ - int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ - int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ - int out_remain; /**< Remaining int16_t samples to write to sink from scratch */ + int32_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ + int out_remain; /**< Remaining int32_t samples to write to sink from scratch */ + uint32_t hop_count; /**< FFT hop counter, increments every processed hop */ + int vad_silence_count; /**< Consecutive VAD=0 hops since last speech */ + int16_t dtx_trailing_silence; /**< Number of trailing silence hops to send, 
from config */ + int16_t dtx_silence_interval; /**< Send silence frame every Nth hop, 0 = disable */ + int dtx_silence_counter; /**< Counter for periodic silence frame send */ }; /* MFCC component private data */ struct mfcc_comp_data { struct mfcc_state state; + struct mfcc_vad_state vad; struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; + struct ipc_msg *msg; /**< IPC notification for VAD switch control */ int max_frames; - mfcc_func mfcc_func; /**< processing function */ + enum sof_ipc_frame source_format; /**< Source audio format for output sizing */ + bool vad_prev; /**< Previous VAD state for edge detection */ + mfcc_source_func source_func; /**< source copy function */ }; static inline int mfcc_buffer_samples_without_wrap(struct mfcc_buffer *buffer, int16_t *ptr) @@ -145,31 +168,83 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state); void mfcc_apply_window(struct mfcc_state *state, int input_shift); -#if CONFIG_FORMAT_S16LE +/** + * \brief Run STFT and Mel/DCT processing. + * \return Number of output coefficients produced, or 0 if not enough data. + */ +int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd); -void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, - struct mfcc_pre_emph *emph, int frames, int source_channel); +/** + * \brief Prepare and commit MFCC output data after STFT processing. + * + * This handles the output data conversion and dispatches to either the + * compress-output or legacy PCM-output path. + * + * \return 0 on success or a negative error code. 
+ */ +int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd, + struct sof_source **sources, struct sof_sink **sinks, + int num_ceps, int frames); -void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); +#if CONFIG_FORMAT_S16LE +void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf, + struct mfcc_pre_emph *emph, int frames, int source_channel); #endif #if CONFIG_FORMAT_S24LE - -void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, +void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel); - -void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); #endif #if CONFIG_FORMAT_S32LE - -void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf, +void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf, struct mfcc_pre_emph *emph, int frames, int source_channel); +#endif + +#if CONFIG_IPC_MAJOR_4 +int mfcc_ipc_notification_init(struct processing_module *mod); + +void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val); -void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource, - struct output_stream_buffer *bsink, int frames); +int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size); + +int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size); + +#else +static inline int mfcc_ipc_notification_init(struct processing_module *mod) +{ + return 0; +} + +static inline void 
mfcc_send_vad_notification(struct processing_module *mod, uint32_t val) +{ +} + +static inline int mfcc_get_config(struct processing_module *mod, + uint32_t config_id, uint32_t *data_offset_size, + uint8_t *fragment, size_t fragment_size) +{ + struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment; + struct mfcc_comp_data *cd = module_get_private_data(mod); + + return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size); +} + +static inline int mfcc_set_config(struct processing_module *mod, uint32_t config_id, + enum module_cfg_fragment_position pos, uint32_t data_offset_size, + const uint8_t *fragment, size_t fragment_size, uint8_t *response, + size_t response_size) +{ + struct mfcc_comp_data *cd = module_get_private_data(mod); + + return comp_data_blob_set(cd->model_handler, pos, data_offset_size, + fragment, fragment_size); +} #endif #ifdef UNIT_TEST diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h new file mode 100644 index 000000000000..6873343d334e --- /dev/null +++ b/src/include/sof/audio/mfcc/mfcc_vad.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2026 Intel Corporation. + * + * Author: Seppo Ingalsuo + */ + +/** + * \file mfcc_vad.h + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * This VAD operates on the Q9.23 Mel log spectrum values produced by + * the MFCC component. It tracks a per-bin noise floor that follows + * the signal downward instantly and rises slowly, then computes a + * speech-weighted energy delta above the floor. + */ + +#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__ +#define __SOF_AUDIO_MFCC_MFCC_VAD_H__ + +#include +#include + +struct processing_module; + +/** + * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame). + */ +#define MFCC_VAD_NOISE_INIT_FRAMES 100 + +/** + * \brief Slow noise floor rise coefficient in Q1.15 (0.003 * 2^15). 
+ */ +#define MFCC_VAD_NOISE_RISE_ALPHA 98 + +/** + * \brief Fast noise floor rise coefficient in Q1.15 (0.020 * 2^15). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 655 + +/** + * \brief Energy threshold for speech detection in Q9.23 (0.30 * 2^23). + */ +#define MFCC_VAD_ENERGY_THRESHOLD 2516582 + +/** + * \brief Hangover frame count to keep VAD active after last speech detection. + */ +#define MFCC_VAD_HANGOVER_FRAMES 20 + +/** + * \brief VAD state structure. + */ +struct mfcc_vad_state { + int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */ + int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */ + int32_t energy; /**< Weighted signal energy in Q9.23 */ + int32_t energy_threshold; /**< Energy threshold Q9.23 */ + int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */ + int16_t frame_count; /**< Initial convergence frames processed */ + int16_t hangover_counter; /**< Current hangover counter */ + int16_t hangover_max; /**< Maximum hangover frames */ + int16_t init_frames; /**< Number of initial frames for fast convergence */ + int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */ + int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */ + int16_t num_mel_bins; /**< Number of Mel bins in use */ + bool initialized; /**< True after first frame processed */ + bool is_speech; /**< Current VAD decision */ +}; + +/** + * \brief Initialize VAD state. + * + * \param[out] vad Pointer to VAD state to initialize. + * \param[in] num_mel_bins Number of Mel bins. + * \param[in] sample_rate Audio sample rate in Hz. + * \param[in] mod Processing module for memory allocation. + * \return 0 on success, negative error code on failure. + */ +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate, + struct processing_module *mod); + +/** + * \brief Process one Mel spectrum frame and update VAD decision. + * + * \param[in,out] vad Pointer to VAD state. 
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values. + * \return 1 if speech detected, 0 if silence. + */ +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log); + +#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */ diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h index 8a0defcd9883..286ee4f5e985 100644 --- a/src/include/user/mfcc.h +++ b/src/include/user/mfcc.h @@ -54,7 +54,9 @@ struct sof_mfcc_config { int16_t mel_scale; /**< Q4.12 default 1.0, use 0.25 for Whisper */ int16_t mmax_init; /**< Q8.7 default 0, with dynamic_mmax false, can sim. Whisper mmax */ int16_t mmax_coef; /**< Q1.15 decay coefficient for dynamic mmax, a small value for slow */ - uint32_t reserved[6]; + uint16_t dtx_trailing_silence_hops; /**< DTX: number of silence hops to send after speech, 0 = send first only */ + uint16_t dtx_silence_hops_interval; /**< DTX: send silence frame every Nth hop during VAD=0, 0 = disable */ + uint32_t reserved[5]; int32_t sample_frequency; /**< Hz. e.g. 16000 */ int32_t pmin; /**< Q1.31 linear power, limit minimum Mel energy, e.g. 
1e-9 */ enum sof_mfcc_mel_log_type mel_log; /**< Use MEL_LOG_IS_LOG, LOG10 or DB*/ @@ -77,6 +79,7 @@ struct sof_mfcc_config { int16_t vtln_high; /**< Reserved, no support */ int16_t vtln_low; /**< Reserved, no support */ int16_t vtln_warp; /**< Reserved, no support */ + int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */ bool htk_compat; /**< Must be false */ bool raw_energy; /**< Reserved, no support */ bool remove_dc_offset; /**< Reserved, no support */ @@ -85,8 +88,11 @@ struct sof_mfcc_config { bool subtract_mean; /**< Must be false (0) */ bool use_energy; /**< Must be false (0) */ bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */ - bool reserved_bool2; - bool reserved_bool3; + bool enable_vad; /**< Run VAD algorithm */ + bool enable_dtx; /**< Discontinuous transmission: suppress silence after trailing frames */ + bool update_controls; /**< Update controls with VAD decision */ + bool compress_output; /**< Use compress PCM output: variable size, no zero padding */ + bool reserved_bool[4]; /* Reserved for future boolean flags, set to false (0) */ } __attribute__((packed)); #endif /* __USER_MFCC_H__ */ diff --git a/tools/topology/topology2/cavs-sdw.conf b/tools/topology/topology2/cavs-sdw.conf index 6932543c06e5..0f597ded3793 100644 --- a/tools/topology/topology2/cavs-sdw.conf +++ b/tools/topology/topology2/cavs-sdw.conf @@ -254,6 +254,14 @@ IncludeByKey.SDW_JACK_AUDIO_FEATURE_CAPTURE { "true" "platform/intel/sdw-jack-audio-feature.conf" } +IncludeByKey.SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-jack-audio-feature-compress.conf" +} + IncludeByKey.SDW_DMIC_AUDIO_FEATURE_CAPTURE { "true" "platform/intel/sdw-dmic-audio-feature.conf" } + +IncludeByKey.SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-dmic-audio-feature-compress.conf" +} diff --git a/tools/topology/topology2/development/tplg-targets.cmake b/tools/topology/topology2/development/tplg-targets.cmake index 
a906852d04f0..155176c16347 100644 --- a/tools/topology/topology2/development/tplg-targets.cmake +++ b/tools/topology/topology2/development/tplg-targets.cmake @@ -479,11 +479,33 @@ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture- SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,COMPRESSED=true" # Soundwire topologies with MFCC audio features capture -"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_AUDIO_FEATURE_CAPTURE=true" -"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ SDW_JACK_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_AUDIO_FEATURE_CAPTURE=true" + +# Soundwire topologies with compress MFCC mel audio features capture +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=344,MFCC_BLOB=mel" + +# Soundwire topologies with compress MFCC cepstral audio features capture +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps" + +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ +SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ 
+SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=344,MFCC_BLOB=mel" + +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ +SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ +SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\ +MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps" ) diff --git a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf index d45baec1ee8f..8788387ec8c7 100644 --- a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf @@ -9,9 +9,9 @@ "mel80" "include/components/mfcc/mel80.conf" } } - #mixer."1" { - # name '$ANALOG_CAPTURE_PCM MFCC switch or volume' - #} + mixer."1" { + name '$ANALOG_CAPTURE_PCM MFCC switch' + } #enum."1" { # name '$ANALOG_CAPTURE_PCM MFCC enum' #} diff --git a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf index cc2ada04b8d7..007dbb91cd4f 100644 --- a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf +++ b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf @@ -9,9 +9,9 @@ "mel80" "include/components/mfcc/mel80.conf" } } - #mixer."1" { - # name '$ANALOG_PLAYBACK_PCM MFCC switch or volume' - #} + mixer."1" { + name '$ANALOG_PLAYBACK_PCM MFCC switch' + } #enum."1" { # name '$ANALOG_PLAYBACK_PCM MFCC enum' #} diff --git a/tools/topology/topology2/include/common/common_definitions.conf b/tools/topology/topology2/include/common/common_definitions.conf index 87c69dd41e41..06f0f425c5e2 100644 --- a/tools/topology/topology2/include/common/common_definitions.conf +++ 
b/tools/topology/topology2/include/common/common_definitions.conf @@ -72,5 +72,7 @@ Define { SDW_JACK_ECHO_REF false # No echo reference for 3.5mm jack SDW_SPK_ECHO_REF false # No echo reference for speaker SDW_JACK_AUDIO_FEATURE_CAPTURE false # No audio features capture for jack + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE false # No compress audio features capture for jack SDW_DMIC_AUDIO_FEATURE_CAPTURE false # No audio features capture for microphone + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE false # No compress audio features capture for microphone } diff --git a/tools/topology/topology2/include/components/mfcc.conf b/tools/topology/topology2/include/components/mfcc.conf index 221df8f2d437..bf908e685048 100644 --- a/tools/topology/topology2/include/components/mfcc.conf +++ b/tools/topology/topology2/include/components/mfcc.conf @@ -13,6 +13,8 @@ # # Where M is pipeline ID and N is a unique integer in the parent object. + + Class.Widget."mfcc" { # # Pipeline ID @@ -45,7 +47,6 @@ Class.Widget."mfcc" { !immutable [ "uuid" - "type" ] !deprecated [ "preload_count" @@ -53,6 +54,26 @@ Class.Widget."mfcc" { unique "instance" } + # + # MFCC Widget switch control to optionally notify VAD state changes + # + Object.Control { + mixer."1" { + Object.Base.channel.1 { + name "fc" + shift 0 + } + Object.Base.ops.1 { + name "ctl" + info "volsw" + #259 binds the mixer control to switch get/put handlers + get 259 + put 259 + } + max 1 + } + } + # # Default attributes for mfcc # diff --git a/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf new file mode 100644 index 000000000000..7056b9e7cb4b --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 
0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x02,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, + 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01, + 0x01,0x00,0x00,0x00,0x01,0x01,0x01,0x01, + 0x00,0x00,0x00,0x00" +} diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf index 42a6d6608b8b..0ac19fa71d04 100644 --- a/tools/topology/topology2/include/components/mfcc/default.conf +++ b/tools/topology/topology2/include/components/mfcc/default.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 26-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" { 0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00, 0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00, 0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64, - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, - 0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00" } diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf index 04aa2a15c660..b18baadd459b 100644 --- a/tools/topology/topology2/include/components/mfcc/mel80.conf +++ b/tools/topology/topology2/include/components/mfcc/mel80.conf @@ -1,12 +1,12 @@ -# Exported MFCC configuration 05-May-2026 +# Exported MFCC configuration 26-May-2026 # cd src/audio/mfcc/tune; octave setup_mfcc.m Object.Base.data."mfcc_config" { bytes " 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, @@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" { 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, - 0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00" + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00, + 0x00,0x00,0x00,0x00" } diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress.conf b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf new file mode 100644 index 000000000000..f26f2af6980c --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, + 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x01, + 0x00,0x00,0x00,0x00" +} diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf new file mode 100644 index 000000000000..d225811ca4d1 --- /dev/null +++ b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf @@ -0,0 +1,24 @@ +# Exported MFCC configuration 26-May-2026 +# cd src/audio/mfcc/tune; octave setup_mfcc.m +Object.Base.data."mfcc_config" { + bytes " + 0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04, + 0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00, + 0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00, + 0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, + 0x01,0x00,0x00,0x01,0x01,0x01,0x01,0x01, + 0x00,0x00,0x00,0x00" +} diff --git a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf 
b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf index 793f71b883ab..fe6249018ef1 100644 --- a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf +++ b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf @@ -22,6 +22,12 @@ +Define { + # Default MFCC output frame size (header + coefficients). + # Can be overridden by feature/platform includes or CMake variable overrides. + MFCC_FRAME_BYTES 344 +} + Class.Pipeline."host-gateway-src-mfcc-capture" { @@ -85,6 +91,9 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { out_bit_depth 32 out_valid_bit_depth 32 out_rate 16000 + # Compress output frame: header + coefficients. + # Size set by MFCC_FRAME_BYTES Define. + obs $MFCC_FRAME_BYTES } ] } @@ -101,6 +110,8 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { in_bit_depth 32 in_valid_bit_depth 32 in_rate 16000 + # Match MFCC compress output frame size + ibs $MFCC_FRAME_BYTES } ] Object.Base.output_audio_format [ @@ -108,6 +119,7 @@ Class.Pipeline."host-gateway-src-mfcc-capture" { out_bit_depth 32 out_valid_bit_depth 32 out_rate 16000 + obs $MFCC_FRAME_BYTES } ] } diff --git a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf index f3926a283a8b..3aad756a85f5 100644 --- a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf +++ b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf @@ -454,11 +454,14 @@ Object.Widget.mfcc.1 { index $DMIC1_HOST_PIPELINE_ID Object.Control { bytes."1" { - name 'Analog Capture TDFB bytes' + name "Dmic1 Capture MFCC bytes" IncludeByKey.DMIC1_MFCC_PARAMS { "default" "include/components/mfcc/default.conf" } } + mixer."1" { + name "Dmic1 Capture MFCC VAD" + } } IncludeByKey.NUM_DMICS { "1" { diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf new file mode 100644 index 
000000000000..9e307043830b --- /dev/null +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf @@ -0,0 +1,71 @@ +Define { + SDW_DMIC_MODULE_COPIER_ID 41 + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Microphone Compress Audio Features" + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 54 + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Compress Audio Features Stream" + SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 133 + # MFCC compress output frame size in bytes: + # Mel-only (80 bins): 24 + 80*4 = 344 + # Cepstral (13 ceps): 24 + 13*4 = 76 + MFCC_FRAME_BYTES 344 + # MFCC config blob: mel or ceps + MFCC_BLOB mel +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + type "encoder" + Object.Control { + bytes."1" { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + IncludeByKey.MFCC_BLOB { + "mel" "include/components/mfcc/mel80_compress_dtx.conf" + "ceps" "include/components/mfcc/ceps13_compress_dtx.conf" + } + } + mixer."1" { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_DMIC_MODULE_COPIER_ID.0" + sink "src.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + compress "true" + + Object.Base.fe_dai.1 { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + 
rates '16000' + channels_min 2 + channels_max 2 + } + } +] diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf index 87039b261597..7d39c11772c1 100644 --- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf @@ -4,6 +4,9 @@ Define { SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID 48 SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Audio Features Stream" SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 131 + # MFCC output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + MFCC_FRAME_BYTES 344 } Object.Pipeline.host-gateway-src-mfcc-capture [ @@ -21,6 +24,9 @@ Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } } } } diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf new file mode 100644 index 000000000000..286af8be0323 --- /dev/null +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf @@ -0,0 +1,71 @@ +Define { + SDW_JACK_MODULE_COPIER_ID 11 + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Jack In Compress Audio Features" + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 53 + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Compress Audio Features Stream" + SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 132 + # MFCC compress output frame size in bytes: + # Mel-only (80 bins): 24 + 80*4 = 344 + # Cepstral (13 ceps): 24 + 13*4 = 76 + MFCC_FRAME_BYTES 344 + # MFCC config blob: mel or ceps + MFCC_BLOB mel +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name 
"$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + type "encoder" + Object.Control { + bytes."1" { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + IncludeByKey.MFCC_BLOB { + "mel" "include/components/mfcc/mel80_compress_dtx.conf" + "ceps" "include/components/mfcc/ceps13_compress_dtx.conf" + } + } + mixer."1" { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_JACK_MODULE_COPIER_ID.0" + sink "src.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + compress "true" + + Object.Base.fe_dai.1 { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + rates '16000' + channels_min $SDW_JACK_CAPTURE_CH + channels_max $SDW_JACK_CAPTURE_CH + } + } +] diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf index 9645199d6907..a0a44eae4d87 100644 --- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf @@ -4,6 +4,9 @@ Define { SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID 47 SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Audio Features Stream" SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 130 + # MFCC output frame size in bytes (24-byte header + coefficients): + # Mel-only (80 bins): 24 + 80*4 = 344 + MFCC_FRAME_BYTES 344 } Object.Pipeline.host-gateway-src-mfcc-capture [ @@ -21,6 +24,9 @@ 
Object.Pipeline.host-gateway-src-mfcc-capture [ name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" } + mixer."1" { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD" + } } } }