diff --git a/src/audio/base_fw.c b/src/audio/base_fw.c
index b86db469765a..c5a874e41c54 100644
--- a/src/audio/base_fw.c
+++ b/src/audio/base_fw.c
@@ -100,6 +100,10 @@ static void get_codec_info(struct sof_tlv **tuple)
 	codec_info.items[codec_info.count++] =
 		SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_VORBIS, SOF_IPC_STREAM_PLAYBACK);
 #endif
+#ifdef CONFIG_COMP_MFCC
+	codec_info.items[codec_info.count++] =
+		SET_CODEC_INFO_ITEM(SND_AUDIOCODEC_BESPOKE, SOF_IPC_STREAM_CAPTURE);
+#endif
 
 	if (!codec_info.count)
 		return;
diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..274c7aa05eb8 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -4,5 +4,8 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
   add_dependencies(app mfcc)
 else()
-  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+  add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
+  if(CONFIG_IPC_MAJOR_4)
+    add_local_sources(sof mfcc_ipc4.c)
+  endif()
 endif()
diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c
index ea09d919009b..971e088cc2cf 100644
--- a/src/audio/mfcc/mfcc.c
+++ b/src/audio/mfcc/mfcc.c
@@ -12,6 +12,8 @@
 #include <sof/audio/format.h>
 #include <sof/audio/pipeline.h>
 #include <sof/audio/ipc-config.h>
+#include <module/audio/source_api.h>
+#include <module/audio/sink_api.h>
 #include <sof/common.h>
 #include <rtos/panic.h>
 #include <sof/ipc/msg.h>
@@ -36,29 +38,31 @@ LOG_MODULE_REGISTER(mfcc, CONFIG_SOF_LOG_LEVEL);
 
 SOF_DEFINE_REG_UUID(mfcc);
 
-__cold_rodata const struct mfcc_func_map mfcc_fm[] = {
+/** \brief Maps a source frame format to the function copying it into the MFCC buffer. */
+struct mfcc_source_func_map {
+	uint8_t source;		/* frame format, matched against enum sof_ipc_frame */
+	mfcc_source_func func;	/* copy function used for that format */
+};
+
+__cold_rodata static const struct mfcc_source_func_map mfcc_sfm[] = {
 #if CONFIG_FORMAT_S16LE
-	{SOF_IPC_FRAME_S16_LE, mfcc_s16_default},
-#endif /* CONFIG_FORMAT_S16LE */
+	{SOF_IPC_FRAME_S16_LE, mfcc_source_copy_s16},
+#endif /* CONFIG_FORMAT_S16LE */
 #if CONFIG_FORMAT_S24LE
-	{SOF_IPC_FRAME_S24_4LE, mfcc_s24_default},
-#endif /* CONFIG_FORMAT_S24LE */
+	{SOF_IPC_FRAME_S24_4LE, mfcc_source_copy_s24},
+#endif /* CONFIG_FORMAT_S24LE */
 #if CONFIG_FORMAT_S32LE
-	{SOF_IPC_FRAME_S32_LE, mfcc_s32_default},
-#endif /* CONFIG_FORMAT_S32LE */
+	{SOF_IPC_FRAME_S32_LE, mfcc_source_copy_s32},
+#endif /* CONFIG_FORMAT_S32LE */
 };
 
-static mfcc_func mfcc_find_func(enum sof_ipc_frame source_format,
-				enum sof_ipc_frame sink_format,
-				const struct mfcc_func_map *map,
-				int n)
+static mfcc_source_func mfcc_find_source_func(enum sof_ipc_frame source_format)
 {
 	int i;
 
-	/* Find suitable processing function from map. */
-	for (i = 0; i < n; i++) {
-		if (source_format == map[i].source)
-			return map[i].func;
+	for (i = 0; i < ARRAY_SIZE(mfcc_sfm); i++) {
+		if (source_format == mfcc_sfm[i].source)
+			return mfcc_sfm[i].func;
 	}
 
 	return NULL;
@@ -97,56 +101,47 @@ static int mfcc_free(struct processing_module *mod)
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 
 	comp_info(mod->dev, "entry");
+	ipc_msg_free(cd->msg);
+	cd->msg = NULL;
 	mod_data_blob_handler_free(mod, cd->model_handler);
 	mfcc_free_buffers(mod);
 	mod_free(mod, cd);
 	return 0;
 }
 
-static int mfcc_get_config(struct processing_module *mod,
-			   uint32_t config_id, uint32_t *data_offset_size,
-			   uint8_t *fragment, size_t fragment_size)
-{
-	struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment;
-	struct mfcc_comp_data *cd = module_get_private_data(mod);
-
-	comp_info(mod->dev, "entry");
-
-	return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
-}
-
-static int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
-			   enum module_cfg_fragment_position pos, uint32_t data_offset_size,
-			   const uint8_t *fragment, size_t fragment_size, uint8_t *response,
-			   size_t response_size)
-{
-	struct mfcc_comp_data *cd = module_get_private_data(mod);
-
-	comp_info(mod->dev, "entry");
-
-	return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
-				  fragment, fragment_size);
-}
 
+/**
+ * \brief Source/sink API based process function for MFCC.
+ *
+ * Copies up to cd->max_frames frames from sources[0] to the MFCC buffer, runs
+ * mfcc_stft_process() and returns the mfcc_process_output() status.
+ */
 static int mfcc_process(struct processing_module *mod,
-			struct input_stream_buffer *input_buffers, int num_input_buffers,
-			struct output_stream_buffer *output_buffers, int num_output_buffers)
+				 struct sof_source **sources, int num_of_sources,
+				 struct sof_sink **sinks, int num_of_sinks)
 {
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
-	struct audio_stream *source = input_buffers->data;
-	struct audio_stream *sink = output_buffers->data;
-	int frames = input_buffers->size;
-
-	comp_dbg(mod->dev, "start");
-
-	frames = MIN(frames, cd->max_frames);
-	cd->mfcc_func(mod, input_buffers, output_buffers, frames);
-
-	/* TODO: use module_update_buffer_position() from #6194 */
-	input_buffers->consumed += audio_stream_frame_bytes(source) * frames;
-	output_buffers->size += audio_stream_frame_bytes(sink) * frames;
-	comp_dbg(mod->dev, "done");
-	return 0;
+	struct comp_dev *dev = mod->dev;
+	struct mfcc_state *state = &cd->state;
+	size_t source_avail;
+	int frames;
+	int num_ceps;
+
+	comp_dbg(dev, "start");
+	source_avail = source_get_data_frames_available(sources[0]);
+	frames = MIN(source_avail, cd->max_frames);
+	if (frames == 0)
+		return -ENODATA; /* no input frames available */
+
+	/* Copy channel state->source_channel of the input to the MFCC circular buffer */
+	cd->source_func(sources[0], &state->buf, &state->emph, frames, state->source_channel);
+
+	/* Run STFT and Mel/DCT processing, the return value is the coefficient count */
+	num_ceps = mfcc_stft_process(mod, cd);
+	if (num_ceps < 0)
+		return num_ceps; /* negative value is passed on as the error */
+
+	return mfcc_process_output(mod, cd, sources, sinks, num_ceps, frames);
 }
 
 static int mfcc_prepare(struct processing_module *mod,
@@ -187,22 +182,41 @@ static int mfcc_prepare(struct processing_module *mod,
 				 audio_stream_get_channels(&sourceb->stream));
 		if (ret < 0) {
 			comp_err(dev, "setup failed.");
-			goto err;
+			return ret;
 		}
+	} else {
+		comp_err(dev, "configuration is missing.");
+		return -EINVAL;
 	}
 
-	cd->mfcc_func = mfcc_find_func(source_format, sink_format, mfcc_fm, ARRAY_SIZE(mfcc_fm));
-	if (!cd->mfcc_func) {
-		comp_err(dev, "No proc func");
-		ret = -EINVAL;
-		goto err;
+	cd->source_func = mfcc_find_source_func(source_format);
+	if (!cd->source_func) {
+		comp_err(dev, "No source func");
+		mfcc_free_buffers(mod);
+		return -EINVAL;
 	}
 
-	return 0;
+	cd->source_format = source_format;
 
-err:
-	comp_set_state(dev, COMP_TRIGGER_RESET);
-	return ret;
+	if (cd->config->compress_output)
+		comp_info(dev, "compress PCM output mode enabled");
+
+	if (cd->config->enable_dtx && !cd->config->compress_output)
+		comp_warn(dev, "enable_dtx ignored in normal PCM mode, only applies to compress");
+
+	/* Initialize VAD switch control notification if enabled */
+	if (cd->config->enable_vad && cd->config->update_controls) {
+		if (!cd->msg) {
+			ret = mfcc_ipc_notification_init(mod);
+			if (ret < 0) {
+				mfcc_free_buffers(mod);
+				return ret;
+			}
+		}
+	}
+
+	cd->vad_prev = false;
+	return 0;
 }
 
 static int mfcc_reset(struct processing_module *mod)
@@ -211,8 +225,13 @@ static int mfcc_reset(struct processing_module *mod)
 
 	comp_info(mod->dev, "entry");
 
+	/* Free MFCC buffers to prevent leaks on reset->prepare cycles.
+	 * mfcc_free_buffers() NULLs the pointers after free.
+	 */
+	mfcc_free_buffers(mod);
+
 	/* Reset to similar state as init() */
-	cd->mfcc_func = NULL;
+	cd->source_func = NULL;
 	return 0;
 }
 
@@ -221,7 +240,7 @@ static const struct module_interface mfcc_interface = {
 	.free = mfcc_free,
 	.set_configuration = mfcc_set_config,
 	.get_configuration = mfcc_get_config,
-	.process_audio_stream = mfcc_process,
+	.process = mfcc_process,
 	.prepare = mfcc_prepare,
 	.reset = mfcc_reset,
 };
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..4713df3d2566 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -7,7 +7,8 @@
 #include <sof/audio/mfcc/mfcc_comp.h>
 
 #include <sof/audio/component.h>
-#include <sof/audio/audio_stream.h>
+#include <module/audio/sink_api.h>
+#include <module/audio/source_api.h>
 #include <sof/audio/format.h>
 #include <sof/math/auditory.h>
 #include <sof/math/fft.h>
@@ -20,15 +21,156 @@
 #include <errno.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <rtos/string.h>
+
+#include <sof/audio/mfcc/mfcc_vad.h>
 
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
+/*
+ * Source/sink API based source copy functions.
+ * These use sof_source API and are compiled on all platforms (generic, HiFi3, HiFi4).
+ */
+
+#if CONFIG_FORMAT_S16LE
+void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf,
+				  struct mfcc_pre_emph *emph, int frames, int source_channel)
+{
+	/* Copy one channel of S16 audio into buf, pre-emphasizing when emph->enable is set */
+	const int channels = source_get_channels(source);
+	const size_t bytes = frames * channels * sizeof(int16_t);
+	int16_t const *rd_begin;
+	int16_t const *rd_base;
+	int16_t const *in;
+	int16_t *out = buf->w_ptr;
+	int rd_samples;
+	int32_t acc;
+	int n;
+
+	/* On failure nothing is copied and no source data is released */
+	if (source_get_data_s16(source, bytes, &rd_begin, &rd_base, &rd_samples))
+		return;
+
+	in = rd_begin + source_channel;
+	for (n = frames; n > 0; n--) {
+		if (!emph->enable) {
+			*out = *in;
+		} else {
+			acc = Q_SHIFT_LEFT(*in, 15, 30) + (int32_t)emph->delay * emph->coef;
+			*out = sat_int16(Q_SHIFT_RND(acc, 30, 15));
+			emph->delay = *in;
+		}
+
+		/* Step to the same channel of the next frame, wrapping in the source buffer */
+		in += channels;
+		if (in >= rd_base + rd_samples)
+			in -= rd_samples;
+
+		out = mfcc_buffer_wrap(buf, out + 1);
+	}
+
+	source_release_data(source, bytes);
+	buf->w_ptr = out;
+	buf->s_avail += frames;
+	buf->s_free -= frames;
+}
+#endif /* CONFIG_FORMAT_S16LE */
+
+#if CONFIG_FORMAT_S24LE
+void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf,
+				  struct mfcc_pre_emph *emph, int frames, int source_channel)
+{
+	/* Copy one channel of S24_4LE audio into buf as 16-bit, with optional pre-emphasis */
+	const int channels = source_get_channels(source);
+	const size_t bytes = frames * channels * sizeof(int32_t);
+	int32_t const *rd_begin;
+	int32_t const *rd_base;
+	int32_t const *in;
+	int16_t *out = buf->w_ptr;
+	int rd_samples;
+	int32_t acc, s;
+	int16_t q15;
+	int n;
+
+	if (source_get_data_s32(source, bytes, &rd_begin, &rd_base, &rd_samples))
+		return;
+
+	in = rd_begin + source_channel;
+	for (n = frames; n > 0; n--) {
+		/* Shift the sample up by 8 bits, then reduce it to 16 bits with saturation */
+		s = (int32_t)((uint32_t)*in << 8);
+		q15 = sat_int16(Q_SHIFT_RND(s, 31, 15));
+		if (emph->enable) {
+			acc = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30);
+			*out = sat_int16(Q_SHIFT_RND(acc, 30, 15));
+			emph->delay = q15;
+		} else {
+			*out = q15;
+		}
+		in += channels;
+		if (in >= rd_base + rd_samples)
+			in -= rd_samples;
+
+		out = mfcc_buffer_wrap(buf, out + 1);
+	}
+
+	source_release_data(source, bytes);
+	buf->w_ptr = out;
+	buf->s_avail += frames;
+	buf->s_free -= frames;
+}
+#endif /* CONFIG_FORMAT_S24LE */
+
+#if CONFIG_FORMAT_S32LE
+void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf,
+				  struct mfcc_pre_emph *emph, int frames, int source_channel)
+{
+	/* Copy one channel of S32 audio into buf as 16-bit, with optional pre-emphasis */
+	const int channels = source_get_channels(source);
+	const size_t bytes = frames * channels * sizeof(int32_t);
+	int32_t const *rd_begin;
+	int32_t const *rd_base;
+	int32_t const *in;
+	int16_t *out = buf->w_ptr;
+	int rd_samples;
+	int32_t acc;
+	int16_t q15;
+	int n;
+
+	if (source_get_data_s32(source, bytes, &rd_begin, &rd_base, &rd_samples))
+		return;
+
+	in = rd_begin + source_channel;
+	for (n = frames; n > 0; n--) {
+		q15 = sat_int16(Q_SHIFT_RND(*in, 31, 15));
+		if (emph->enable) {
+			acc = (int32_t)emph->delay * emph->coef + Q_SHIFT(*in, 31, 30);
+			*out = sat_int16(Q_SHIFT_RND(acc, 30, 15));
+			emph->delay = q15;
+		} else {
+			*out = q15;
+		}
+		in += channels;
+		if (in >= rd_base + rd_samples)
+			in -= rd_samples;
+
+		out = mfcc_buffer_wrap(buf, out + 1);
+	}
+
+	source_release_data(source, bytes);
+	buf->w_ptr = out;
+	buf->s_avail += frames;
+	buf->s_free -= frames;
+}
+#endif /* CONFIG_FORMAT_S32LE */
+
 /*
  * The main processing function for MFCC
  */
 
-static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *cd)
+int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd)
 {
+	const struct comp_dev *dev = mod->dev;
 	struct sof_mfcc_config *config = cd->config;
 	struct mfcc_state *state = &cd->state;
 	struct mfcc_buffer *buf = &state->buf;
@@ -144,11 +286,6 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 					sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
 			}
 
-			/* Store Q9.7 version in mel_spectra for s16 output mode */
-			for (j = 0; j < state->dct.num_in; j++)
-				state->mel_spectra->data[j] =
-					sat_int16(state->mel_log_32[j] >> 16);
-
 			/* Enable this to check mmax decay */
 			comp_dbg(dev, "state->mmax = %d", state->mmax);
 		} else {
@@ -169,343 +306,308 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 
 			cc_count += state->dct.num_out;
 		}
-	}
 
-	return cc_count;
-}
-
-void mfcc_fill_fft_buffer(struct mfcc_state *state)
-{
-	struct mfcc_buffer *buf = &state->buf;
-	struct mfcc_fft *fft = &state->fft;
-	int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real;
-	const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t);
-	int16_t *prev = state->prev_data;
-	int16_t *prev_end = prev + state->prev_data_size;
-	int16_t *r = buf->r_ptr;
-	int copied;
-	int nmax;
-	int n;
-	int j;
+		/* Use hop counter for frame numbering (independent of VAD enable) */
+		state->header.frame_number = state->hop_count;
 
-	/* Copy overlapped samples from state buffer. The fft_buf has been
-	 * cleared by caller so imaginary part remains zero.
-	 */
-	while (prev < prev_end) {
-		*d = *prev++;
-		d += fft_elem_inc;
-	}
+		/* Run VAD on the mel log spectrum (available in both modes) */
+		if (config->enable_vad) {
+			mfcc_vad_update(&cd->vad, state->mel_log_32);
 
-	/* Copy hop size of new data from circular buffer */
-	for (copied = 0; copied < fft->fft_hop_size; copied += n) {
-		nmax = fft->fft_hop_size - copied;
-		n = mfcc_buffer_samples_without_wrap(buf, r);
-		n = MIN(n, nmax);
-		for (j = 0; j < n; j++) {
-			*d = *r++;
-			d += fft_elem_inc;
+			/* Populate data header for this output frame */
+			state->header.energy = cd->vad.energy;
+			state->header.noise_energy = cd->vad.noise_energy;
+			state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
 		}
-		r = mfcc_buffer_wrap(buf, r);
-	}
 
-	buf->s_avail -= copied;
-	buf->s_free += copied;
-	buf->r_ptr = r;
+		/* Increment hop counter at end of hop processing */
+		state->hop_count++;
 
-	/* Copy for next time data back to overlap buffer */
-	d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real;
-	prev = state->prev_data;
-	while (prev < prev_end) {
-		*prev++ = *d;
-		d += fft_elem_inc;
-	}
-}
+		/* Send notification when VAD state changes */
+		if (config->enable_vad && config->update_controls) {
+			bool vad_now = cd->vad.is_speech;
 
-#if CONFIG_FORMAT_S16LE
-static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr,
-					int samples)
-{
-	int copied;
-	int nmax;
-	int n;
-
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
-		n = MIN(n, nmax);
-		memset(w_ptr, 0, n * sizeof(int16_t));
-		w_ptr = audio_stream_wrap(sink, w_ptr + n);
+			if (vad_now != cd->vad_prev) {
+				mfcc_send_vad_notification(mod, vad_now ? 1 : 0);
+				cd->vad_prev = vad_now;
+			}
+		}
 	}
 
-	return w_ptr;
+	return cc_count;
 }
 
-static int16_t *mfcc_sink_copy_data_s16(const struct audio_stream *sink, int16_t *w_ptr,
-					int samples, int16_t *r_ptr)
+/**
+ * \brief Copy max_bytes from src to *dst inside a circular buffer, advancing *dst with wrap.
+ */
+static size_t mfcc_sink_write_bytes(uint8_t **dst, uint8_t *buf_start,
+				    size_t buf_size, const uint8_t *src,
+				    size_t max_bytes)
 {
-	int copied;
-	int nmax;
-	int n;
+	uint8_t *buf_end = buf_start + buf_size; /* one past the last buffer byte */
+	size_t chunk; /* bytes that fit before buf_end */
 
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s16(sink, w_ptr);
-		n = MIN(n, nmax);
-		/* Not using memcpy_s() due to speed need */
-		memcpy(w_ptr, r_ptr, n * sizeof(int16_t));
-		w_ptr = audio_stream_wrap(sink, w_ptr + n);
-		r_ptr += n;
+	if (max_bytes == 0)
+		return 0;
+
+	chunk = MIN(max_bytes, (size_t)(buf_end - *dst));
+	memcpy(*dst, src, chunk);
+	if (chunk < max_bytes) { /* remainder continues from buf_start */
+		memcpy(buf_start, src + chunk, max_bytes - chunk);
+		*dst = buf_start + (max_bytes - chunk);
+	} else {
+		*dst += chunk;
+		if (*dst >= buf_end) /* write ended exactly at the buffer end */
+			*dst = buf_start;
 	}
 
-	return w_ptr;
+	return max_bytes; /* always the full requested count */
 }
 
-void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames)
+/**
+ * \brief Point out_data_ptr at num_ceps int32 values and mark header and data as pending.
+ */
+static void mfcc_prepare_output(struct mfcc_state *state, int num_ceps)
 {
-	struct audio_stream *sink = bsink->data;
-	struct mfcc_comp_data *cd = module_get_private_data(mod);
-	struct mfcc_state *state = &cd->state;
-	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
-	int16_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 2;
-	int num_ceps;
-	int sink_samples;
-	int to_copy;
-
-	/* Get samples from source buffer */
-	mfcc_source_copy_s16(bsource, buf, &state->emph, frames, state->source_channel);
-
-	/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
-
-	/* If new output produced, set up pointer into scratch data and mark magic pending */
-	if (num_ceps > 0) {
-		if (state->mel_only)
-			state->out_data_ptr = state->mel_spectra->data;
-		else
-			state->out_data_ptr = state->cepstral_coef->data;
-
-		state->out_remain = num_ceps;
-		state->magic_pending = true;
-	}
+	int k;
 
-	/* Write to sink, limited by period size */
-	sink_samples = frames * audio_stream_get_channels(sink);
+	if (num_ceps <= 0)
+		return; /* nothing new, keep any output that is still pending */
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
-	}
+	if (state->mel_only) {
+		state->out_data_ptr = state->mel_log_32;
+	} else {
+		/* Widen int16 Q9.7 cepstral coefficients to int32 Q9.23 by multiplying,
+		 * as left-shifting a negative value is undefined. Safe to copy forward:
+		 * cepstral_coef is in fft_out, mel_log_32 in fft_buf (separate buffers).
+		 */
+		for (k = 0; k < num_ceps; k++)
+			state->mel_log_32[k] = (int32_t)state->cepstral_coef->data[k] * (1 << 16);
 
-	/* Write cepstral/mel data from scratch buffer */
-	to_copy = MIN(state->out_remain, sink_samples);
-	if (to_copy > 0) {
-		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, to_copy, state->out_data_ptr);
-		state->out_data_ptr += to_copy;
-		state->out_remain -= to_copy;
-		sink_samples -= to_copy;
+		state->out_data_ptr = state->mel_log_32;
 	}
 
-	/* Zero-fill remaining sink samples */
-	w_ptr = mfcc_sink_copy_zero_s16(sink, w_ptr, sink_samples);
+	state->out_remain = num_ceps; /* int32 values still to be written */
+	state->header_pending = true; /* the header goes out before the data */
 }
-#endif /* CONFIG_FORMAT_S16LE */
 
-#if CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE
-static int32_t *mfcc_sink_copy_zero_s32(const struct audio_stream *sink, int32_t *w_ptr,
-					int samples)
+/**
+ * \brief Write header plus num_ceps int32 values to sinks[0], subject to VAD/DTX gating.
+ */
+static int mfcc_output_compress(struct processing_module *mod, struct mfcc_comp_data *cd,
+				struct sof_sink **sinks, int num_ceps)
 {
-	int copied;
-	int nmax;
-	int n;
-
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s32(sink, w_ptr);
-		n = MIN(n, nmax);
-		memset(w_ptr, 0, n * sizeof(int32_t));
-		w_ptr = audio_stream_wrap(sink, w_ptr + n);
+	struct comp_dev *dev = mod->dev;
+	struct mfcc_state *state = &cd->state;
+	size_t out_bytes;
+	size_t commit_bytes;
+	void *sink_ptr;
+	void *sink_start;
+	size_t sink_buf_size;
+	int ret;
+
+	if (num_ceps <= 0)
+		return 0; /* no new frame to send */
+
+	out_bytes = sizeof(state->header) + num_ceps * sizeof(int32_t);
+
+	if (cd->config->enable_vad && !cd->vad.is_speech) {
+		state->vad_silence_count++;
+		/* With DTX enabled, send trailing silence frames
+		 * (configurable count) then suppress. After trailing
+		 * frames, optionally send periodic silence updates
+		 * at the configured interval. This gives the host
+		 * enough silence to detect end-of-speech while
+		 * keeping alive updates during long silence.
+		 * Without DTX, output every frame regardless of VAD.
+		 */
+		if (cd->config->enable_dtx) {
+			if (state->vad_silence_count > state->dtx_trailing_silence) {
+				/* Periodic silence frame, only if an interval is configured */
+				if (state->dtx_silence_interval > 0) {
+					state->dtx_silence_counter++;
+					if (state->dtx_silence_counter >= state->dtx_silence_interval) {
+						state->dtx_silence_counter = 0;
+						goto send_frame;
+					}
+				}
+				state->header_pending = false; /* drop the suppressed frame */
+				state->out_remain = 0;
+				return 0;
+			}
+		}
+	} else {
+		state->vad_silence_count = 0; /* VAD disabled or speech detected */
+		state->dtx_silence_counter = 0;
 	}
 
-	return w_ptr;
-}
+send_frame:
+	commit_bytes = out_bytes;
 
-static int32_t *mfcc_sink_copy_data_s32(const struct audio_stream *sink, int32_t *w_ptr,
-					int samples, int32_t *r_ptr)
-{
-	int copied;
-	int nmax;
-	int n;
+	if (sink_get_free_size(sinks[0]) < commit_bytes)
+		return -ENOSPC; /* the frame is only written as a whole */
 
-	for (copied = 0; copied < samples; copied += n) {
-		nmax = samples - copied;
-		n = audio_stream_samples_without_wrap_s32(sink, w_ptr);
-		n = MIN(n, nmax);
-		/* Not using memcpy_s() due to speed need */
-		memcpy(w_ptr, r_ptr, n * sizeof(int32_t));
-		w_ptr = audio_stream_wrap(sink, w_ptr + n);
-		r_ptr += n;
+	ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr,
+			      &sink_start, &sink_buf_size);
+	if (ret)
+		return ret;
+
+	{
+		uint8_t *dst = sink_ptr; /* write position, wrapped by the helper */
+
+		mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size,
+				      (uint8_t *)&state->header, sizeof(state->header));
+		mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size,
+				      (uint8_t *)state->out_data_ptr,
+				      num_ceps * sizeof(int32_t));
 	}
 
-	return w_ptr;
+	state->header_pending = false; /* whole frame written, nothing left pending */
+	state->out_remain = 0;
+
+	sink_commit_buffer(sinks[0], commit_bytes);
+	comp_dbg(dev, "done, produced %zu bytes", commit_bytes);
+	return 0;
 }
-#endif /* CONFIG_FORMAT_S24LE || CONFIG_FORMAT_S32LE */
 
-#if CONFIG_FORMAT_S24LE
-void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames)
+/**
+ * \brief Commit one zero-filled period to sinks[0] with any pending header and int32 data.
+ */
+static int mfcc_output_legacy(struct processing_module *mod, struct mfcc_comp_data *cd,
+			      struct sof_source **sources, struct sof_sink **sinks,
+			      int frames)
 {
-	struct audio_stream *sink = bsink->data;
-	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct comp_dev *dev = mod->dev;
 	struct mfcc_state *state = &cd->state;
-	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
-	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
-	int num_ceps;
-	int sink_samples;
-	int remain_s32;
-	int to_copy;
-	int k;
-
-	/* Get samples from source buffer */
-	mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel);
-
-	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
-
-	/* If new output produced, set up pointer into scratch data */
-	if (num_ceps > 0) {
-		if (state->mel_only) {
-			/* Convert mel_log_32 from Q9.23 to Q9.15 in-place */
-			for (k = 0; k < num_ceps; k++)
-				state->mel_log_32[k] >>= 8;
-
-			state->out_data_ptr_32 = state->mel_log_32;
-		} else {
-			state->out_data_ptr = state->cepstral_coef->data;
+	size_t commit_bytes;
+	void *sink_ptr;
+	void *sink_start;
+	size_t sink_buf_size;
+	int ret;
+
+	commit_bytes = sink_get_frame_bytes(sinks[0]);
+	commit_bytes *= frames; /* the sink gets as many frames as were consumed */
+
+	if (sink_get_free_size(sinks[0]) < commit_bytes)
+		return -ENOSPC;
+
+	ret = sink_get_buffer(sinks[0], commit_bytes, &sink_ptr,
+			      &sink_start, &sink_buf_size);
+	if (ret)
+		return ret;
+
+	/* Zero-fill entire period first, in two parts if it wraps at the buffer end */
+	{
+		size_t bytes_to_end = (size_t)((uint8_t *)sink_start + sink_buf_size -
+					       (uint8_t *)sink_ptr);
+
+		if (bytes_to_end >= commit_bytes)
+			memset(sink_ptr, 0, commit_bytes);
+		else {
+			memset(sink_ptr, 0, bytes_to_end);
+			memset(sink_start, 0, commit_bytes - bytes_to_end);
 		}
-
-		state->out_remain = num_ceps;
-		state->magic_pending = true;
 	}
 
-	/* Write to sink, limited by period size */
-	sink_samples = frames * audio_stream_get_channels(sink);
+	{
+		uint8_t *dst = sink_ptr;
+		size_t avail = commit_bytes; /* bytes of this period not yet used */
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
-	}
+		/* Write pending header, all or nothing; it stays pending if it does not fit */
+		if (state->header_pending && avail > 0) {
+			size_t hdr_size = sizeof(state->header);
 
-	if (state->mel_only) {
-		/* Write 32-bit mel data Q9.15, one value per int32_t */
-		to_copy = MIN(state->out_remain, sink_samples);
-		if (to_copy > 0) {
-			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
-							state->out_data_ptr_32);
-			state->out_data_ptr_32 += to_copy;
-			state->out_remain -= to_copy;
-			sink_samples -= to_copy;
+			if (avail >= hdr_size) {
+				mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size,
+						      (uint8_t *)&state->header, hdr_size);
+				avail -= hdr_size;
+				state->header_pending = false;
+			}
 		}
-	} else {
-		/* Write cepstral data packed as int32_t from scratch buffer */
-		remain_s32 = (state->out_remain + 1) / 2;
-		to_copy = MIN(remain_s32, sink_samples);
-		if (to_copy > 0) {
-			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
-							(int32_t *)state->out_data_ptr);
-			state->out_data_ptr += to_copy * 2;
-			state->out_remain -= to_copy * 2;
-			if (state->out_remain < 0)
-				state->out_remain = 0;
 
-			sink_samples -= to_copy;
+		/* Write pending feature data (always int32), never ahead of its header */
+		if (!state->header_pending && state->out_remain > 0 && avail > 0) {
+			size_t data_bytes;
+			size_t to_write;
+
+			data_bytes = state->out_remain * sizeof(int32_t);
+			to_write = MIN(data_bytes, avail) & ~(size_t)3; /* whole int32 words only */
+			if (to_write > 0) {
+				int n32;
+
+				mfcc_sink_write_bytes(&dst, sink_start, sink_buf_size,
+						      (uint8_t *)state->out_data_ptr,
+						      to_write);
+				n32 = to_write / sizeof(int32_t);
+				state->out_data_ptr += n32; /* the rest goes out on later calls */
+				state->out_remain -= n32;
+			}
 		}
 	}
 
-	/* Zero-fill remaining sink samples */
-	w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples);
+	sink_commit_buffer(sinks[0], commit_bytes);
+	comp_dbg(dev, "done, produced %zu bytes", commit_bytes);
+	return 0;
 }
-#endif /* CONFIG_FORMAT_S24LE */
 
-#if CONFIG_FORMAT_S32LE
-void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames)
+int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd,
+			 struct sof_source **sources, struct sof_sink **sinks,
+			 int num_ceps, int frames)
 {
-	struct audio_stream *sink = bsink->data;
-	struct mfcc_comp_data *cd = module_get_private_data(mod);
-	struct mfcc_state *state = &cd->state;
-	struct mfcc_buffer *buf = &cd->state.buf;
-	uint32_t magic = MFCC_MAGIC;
-	int32_t *w_ptr = audio_stream_get_wptr(sink);
-	const int num_magic = 1; /* one int32_t word for magic */
-	int num_ceps;
-	int sink_samples;
-	int remain_s32;
-	int to_copy;
-
-	/* Get samples from source buffer */
-	mfcc_source_copy_s32(bsource, buf, &state->emph, frames, state->source_channel);
-
-	/* Run STFT and processing after FFT */
-	num_ceps = mfcc_stft_process(mod->dev, cd);
-
-	/* If new output produced, set up pointer into scratch data */
-	if (num_ceps > 0) {
-		if (state->mel_only) {
-			state->out_data_ptr_32 = state->mel_log_32;
-		} else {
-			state->out_data_ptr = state->cepstral_coef->data;
-		}
+	if (num_ceps > 0) /* stage the new coefficients as pending output */
+		mfcc_prepare_output(&cd->state, num_ceps);
 
-		state->out_remain = num_ceps;
-		state->magic_pending = true;
-	}
+	if (cd->config->compress_output) /* compress mode: header plus data per frame */
+		return mfcc_output_compress(mod, cd, sinks, num_ceps);
 
-	/* Write to sink, limited by period size */
-	sink_samples = frames * audio_stream_get_channels(sink);
+	return mfcc_output_legacy(mod, cd, sources, sinks, frames); /* legacy PCM mode */
+}
 
-	/* Write magic word first if pending */
-	if (state->magic_pending && sink_samples >= num_magic) {
-		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
-		sink_samples -= num_magic;
-		state->magic_pending = false;
-	}
+void mfcc_fill_fft_buffer(struct mfcc_state *state)
+{
+	struct mfcc_buffer *buf = &state->buf;
+	struct mfcc_fft *fft = &state->fft;
+	int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real;
+	const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); /* int32 words per element */
+	int16_t *prev = state->prev_data;
+	int16_t *prev_end = prev + state->prev_data_size;
+	int16_t *r = buf->r_ptr;
+	int copied;
+	int nmax;
+	int n;
+	int j;
 
-	if (state->mel_only) {
-		/* Write 32-bit mel data Q9.23, one value per int32_t */
-		to_copy = MIN(state->out_remain, sink_samples);
-		if (to_copy > 0) {
-			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
-							state->out_data_ptr_32);
-			state->out_data_ptr_32 += to_copy;
-			state->out_remain -= to_copy;
-			sink_samples -= to_copy;
-		}
-	} else {
-		/* Write cepstral data packed as int32_t from scratch buffer */
-		remain_s32 = (state->out_remain + 1) / 2;
-		to_copy = MIN(remain_s32, sink_samples);
-		if (to_copy > 0) {
-			w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy,
-							(int32_t *)state->out_data_ptr);
-			state->out_data_ptr += to_copy * 2;
-			state->out_remain -= to_copy * 2;
-			if (state->out_remain < 0)
-				state->out_remain = 0;
+	/* Copy overlapped samples from state buffer into the real parts. The fft_buf
+	 * has been cleared by caller so imaginary part remains zero.
+	 */
+	while (prev < prev_end) {
+		*d = *prev++;
+		d += fft_elem_inc;
+	}
 
-			sink_samples -= to_copy;
+	/* Copy hop size of new data from circular buffer, in runs that stop at the wrap */
+	for (copied = 0; copied < fft->fft_hop_size; copied += n) {
+		nmax = fft->fft_hop_size - copied;
+		n = mfcc_buffer_samples_without_wrap(buf, r);
+		n = MIN(n, nmax);
+		for (j = 0; j < n; j++) {
+			*d = *r++;
+			d += fft_elem_inc;
 		}
+		r = mfcc_buffer_wrap(buf, r);
 	}
 
-	/* Zero-fill remaining sink samples */
-	w_ptr = mfcc_sink_copy_zero_s32(sink, w_ptr, sink_samples);
+	buf->s_avail -= copied; /* the consumed samples become free space */
+	buf->s_free += copied;
+	buf->r_ptr = r;
+
+	/* Save the samples that start one hop into this frame as overlap for the next call */
+	d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real;
+	prev = state->prev_data;
+	while (prev < prev_end) {
+		*prev++ = *d;
+		d += fft_elem_inc;
+	}
 }
-#endif /* CONFIG_FORMAT_S32LE */
+
+
diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c
index 73ac49272ed4..d5eaf65ba091 100644
--- a/src/audio/mfcc/mfcc_generic.c
+++ b/src/audio/mfcc/mfcc_generic.c
@@ -8,7 +8,6 @@
 #ifdef MFCC_GENERIC
 
 #include <sof/audio/component.h>
-#include <sof/audio/audio_stream.h>
 #include <sof/math/auditory.h>
 #include <sof/math/icomplex16.h>
 #include <sof/math/icomplex32.h>
@@ -64,161 +63,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 		fft->fft_buf[i + j].real = (fft->fft_buf[i + j].real * state->window[j]) << s;
 }
 
-#if CONFIG_FORMAT_S16LE
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int32_t s;
-	int16_t *x0;
-	int16_t *x = audio_stream_get_rptr(source);
-	int16_t *w = buf->w_ptr;
-	int copied;
-	int nmax;
-	int n1;
-	int n2;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 */
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n1 = audio_stream_frames_without_wrap(source, x);
-		n2 = mfcc_buffer_samples_without_wrap(buf, w);
-		n = MIN(n1, n2);
-		n = MIN(n, nmax);
-		x0 = x + source_channel;
-		for (i = 0; i < n; i++) {
-			if (emph->enable) {
-				/* Q1.15 x Q1.15 -> Q2.30 */
-				s = (int32_t)emph->delay * emph->coef + Q_SHIFT_LEFT(*x0, 15, 30);
-				*w = sat_int16(Q_SHIFT_RND(s, 30, 15));
-				emph->delay = *x0;
-			} else {
-				*w = *x0;
-			}
-			x0 += num_channels;
-			w++;
-		}
-
-		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
-		w = mfcc_buffer_wrap(buf, w);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = w;
-}
-#endif /* CONFIG_FORMAT_S16LE */
-
-#if CONFIG_FORMAT_S24LE
-
-void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int32_t tmp, s;
-	int32_t *x0;
-	int32_t *x = audio_stream_get_rptr(source);
-	int16_t *w = buf->w_ptr;
-	int copied;
-	int nmax;
-	int n1;
-	int n2;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 * S24_4LE data is in 32-bit container, shift left by 8 to Q1.31,
-	 * then convert to Q1.15 with rounding.
-	 */
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n1 = audio_stream_frames_without_wrap(source, x);
-		n2 = mfcc_buffer_samples_without_wrap(buf, w);
-		n = MIN(n1, n2);
-		n = MIN(n, nmax);
-		x0 = x + source_channel;
-		for (i = 0; i < n; i++) {
-			if (emph->enable) {
-				/* Convert to Q1.31, ignore highest byte */
-				s = (int32_t)((uint32_t)*x0 << 8);
-				/* Q1.15 x Q1.15 -> Q2.30 */
-				tmp = (int32_t)emph->delay * emph->coef + Q_SHIFT(s, 31, 30);
-				*w = sat_int16(Q_SHIFT_RND(tmp, 30, 15));
-				emph->delay = sat_int16(Q_SHIFT_RND(s, 31, 15));
-			} else {
-				/* Convert to Q1.31, ignore highest byte */
-				s = (int32_t)((uint32_t)*x0 << 8);
-				*w = sat_int16(Q_SHIFT_RND(s, 31, 15));
-			}
-			x0 += num_channels;
-			w++;
-		}
-
-		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
-		w = mfcc_buffer_wrap(buf, w);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = w;
-}
-
-#endif /* CONFIG_FORMAT_S24LE */
-
-#if CONFIG_FORMAT_S32LE
-
-void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int32_t s;
-	int32_t *x0;
-	int32_t *x = audio_stream_get_rptr(source);
-	int16_t *w = buf->w_ptr;
-	int copied;
-	int nmax;
-	int n1;
-	int n2;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 * S32 data is in 32-bit container, shift right by 16 to get 16-bit.
-	 */
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n1 = audio_stream_frames_without_wrap(source, x);
-		n2 = mfcc_buffer_samples_without_wrap(buf, w);
-		n = MIN(n1, n2);
-		n = MIN(n, nmax);
-		x0 = x + source_channel;
-		for (i = 0; i < n; i++) {
-			if (emph->enable) {
-				/* Q1.15 x Q1.15 -> Q2.30 */
-				s = (int32_t)emph->delay * emph->coef + Q_SHIFT(*x0, 31, 30);
-				*w = sat_int16(Q_SHIFT_RND(s, 30, 15));
-				emph->delay = sat_int16(Q_SHIFT_RND(*x0, 31, 15));
-			} else {
-				*w = sat_int16(Q_SHIFT_RND(*x0, 31, 15));
-			}
-			x0 += num_channels;
-			w++;
-		}
-
-		x = audio_stream_wrap(source, x + n * audio_stream_get_channels(source));
-		w = mfcc_buffer_wrap(buf, w);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = w;
-}
-#endif /* CONFIG_FORMAT_S32LE */
-
 #endif /* MFCC_GENERIC */
diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c
index 80c384ad6c64..8b6a01e1f40d 100644
--- a/src/audio/mfcc/mfcc_hifi3.c
+++ b/src/audio/mfcc/mfcc_hifi3.c
@@ -9,7 +9,6 @@
 #ifdef MFCC_HIFI3
 
 #include <sof/audio/component.h>
-#include <sof/audio/audio_stream.h>
 #include <sof/math/auditory.h>
 #include <sof/math/icomplex16.h>
 #include <sof/math/icomplex32.h>
@@ -35,66 +34,6 @@ static inline void set_circular_buf0(const void *start, const void *end)
  * MFCC algorithm code
  */
 
-#if CONFIG_FORMAT_S16LE
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int copied;
-	int nmax;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int16 *in;
-	ae_int16 *x = (ae_int16 *)audio_stream_get_rptr(source);
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef = emph->coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int16) * num_channels;
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 */
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n = audio_stream_frames_without_wrap(source, x);
-		n = MIN(n, nmax);
-		nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out);
-		n = MIN(n, nmax);
-		in = x + source_channel;
-		if (emph->enable) {
-			delay = emph->delay;
-			for (i = 0; i < n; i++) {
-				AE_L16_XP(sample, in, in_inc);
-				/* Q1.15 -> Q1.31 */
-				temp = AE_CVT32X2F16_10(sample);
-				AE_MULAF16SS_00(temp, delay, coef);
-				delay = sample;
-				sample = AE_ROUND16X4F32SSYM(temp, temp);
-				/* 2 = sizeof(ae_int16)*/
-				AE_S16_0_IP(sample, out, 2);
-			}
-			emph->delay = delay;
-
-		} else {
-			for (i = 0; i < n; i++) {
-				AE_L16_XP(sample, in, in_inc);
-				/* 2 = sizeof(ae_int16)*/
-				AE_S16_0_IP(sample, out, 2);
-			}
-		}
-
-		x = audio_stream_wrap(source, x + n * num_channels);
-		out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S16LE */
-
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length)
 {
@@ -152,129 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 	}
 }
 
-#if CONFIG_FORMAT_S24LE
-void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int copied;
-	int nmax;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int32 *in;
-	ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source);
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int32x2 sample32;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef = emph->coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int32) * num_channels;
-
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n = audio_stream_frames_without_wrap(source, x);
-		n = MIN(n, nmax);
-		nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out);
-		n = MIN(n, nmax);
-		in = x + source_channel;
-		if (emph->enable) {
-			delay = emph->delay;
-			for (i = 0; i < n; i++) {
-				AE_L32_XP(sample32, in, in_inc);
-				/* Shift left by 8 to sign-extend to Q1.31 */
-				sample32 = AE_SLAI32(sample32, 8);
-				/* Then shift right by 16 to get 16-bit */
-				sample32 = AE_SRAI32(sample32, 16);
-				sample = AE_SAT16X4(sample32, sample32);
-				/* Q1.15 -> Q1.31 */
-				temp = AE_CVT32X2F16_10(sample);
-				AE_MULAF16SS_00(temp, delay, coef);
-				delay = sample;
-				sample = AE_ROUND16X4F32SSYM(temp, temp);
-				AE_S16_0_IP(sample, out, 2);
-			}
-			emph->delay = delay;
-		} else {
-			for (i = 0; i < n; i++) {
-				AE_L32_XP(sample32, in, in_inc);
-				/* Shift left by 8 to sign-extend to Q1.31 */
-				sample32 = AE_SLAI32(sample32, 8);
-				/* Then shift right by 16 to get 16-bit */
-				sample32 = AE_SRAI32(sample32, 16);
-				sample = AE_SAT16X4(sample32, sample32);
-				AE_S16_0_IP(sample, out, 2);
-			}
-		}
-
-		x = audio_stream_wrap(source, x + n * num_channels);
-		out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S24LE */
-
-#if CONFIG_FORMAT_S32LE
-void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int copied;
-	int nmax;
-	int n;
-	int i;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int32 *in;
-	ae_int32 *x = (ae_int32 *)audio_stream_get_rptr(source);
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int32x2 sample32;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef = emph->coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int32) * num_channels;
-
-	for (copied = 0; copied < frames; copied += n) {
-		nmax = frames - copied;
-		n = audio_stream_frames_without_wrap(source, x);
-		n = MIN(n, nmax);
-		nmax = mfcc_buffer_samples_without_wrap(buf, (int16_t *)out);
-		n = MIN(n, nmax);
-		in = x + source_channel;
-		if (emph->enable) {
-			delay = emph->delay;
-			for (i = 0; i < n; i++) {
-				AE_L32_XP(sample32, in, in_inc);
-				/* S32: shift right by 16 to get 16-bit */
-				sample32 = AE_SRAI32(sample32, 16);
-				sample = AE_SAT16X4(sample32, sample32);
-				/* Q1.15 -> Q1.31 */
-				temp = AE_CVT32X2F16_10(sample);
-				AE_MULAF16SS_00(temp, delay, coef);
-				delay = sample;
-				sample = AE_ROUND16X4F32SSYM(temp, temp);
-				AE_S16_0_IP(sample, out, 2);
-			}
-			emph->delay = delay;
-		} else {
-			for (i = 0; i < n; i++) {
-				AE_L32_XP(sample32, in, in_inc);
-				sample32 = AE_SRAI32(sample32, 16);
-				sample = AE_SAT16X4(sample32, sample32);
-				AE_S16_0_IP(sample, out, 2);
-			}
-		}
-
-		x = audio_stream_wrap(source, x + n * num_channels);
-		out = (ae_int16 *)mfcc_buffer_wrap(buf, (int16_t *)out);
-	}
-	buf->s_avail += copied;
-	buf->s_free -= copied;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S32LE */
-
 #endif /* MFCC_HIFI3 */
diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c
index 63986870793b..8cd956fcb079 100644
--- a/src/audio/mfcc/mfcc_hifi4.c
+++ b/src/audio/mfcc/mfcc_hifi4.c
@@ -9,7 +9,6 @@
 #ifdef MFCC_HIFI4
 
 #include <sof/audio/component.h>
-#include <sof/audio/audio_stream.h>
 #include <sof/math/auditory.h>
 #include <sof/math/icomplex16.h>
 #include <sof/math/icomplex32.h>
@@ -31,66 +30,10 @@ static inline void set_circular_buf0(const void *start, const void *end)
 	AE_SETCEND0(end);
 }
 
-/* Setup circular for buffer 1 */
-static inline void set_circular_buf1(const void *start, const void *end)
-{
-	AE_SETCBEGIN1(start);
-	AE_SETCEND1(end);
-}
-
 /*
  * MFCC algorithm code
  */
 
-#if CONFIG_FORMAT_S16LE
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int16 *in = (ae_int16 *)source->r_ptr + source_channel;
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int16) * num_channels;
-	const int out_inc = sizeof(ae_int16);
-	int i;
-
-	set_circular_buf1(buf->addr, buf->end_addr);
-	set_circular_buf0(source->addr, source->end_addr);
-
-	/* Copy from source to pre-buffer for FFT.
-	 * The pre-emphasis filter is done in this step.
-	 */
-	if (emph->enable) {
-		delay = emph->delay;
-		coef = emph->coef;
-		for (i = 0; i < frames; i++) {
-			AE_L16_XC(sample, in, in_inc);
-
-			/* Q1.15 -> Q1.31 */
-			temp = AE_CVT32X2F16_10(sample);
-			AE_MULAF16SS_00(temp, delay, coef);
-			delay = sample;
-			sample = AE_ROUND16X4F32SSYM(temp, temp);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-		emph->delay = delay;
-	} else {
-		for (i = 0; i < frames; i++) {
-			AE_L16_XC(sample, in, in_inc);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-	}
-
-	buf->s_avail += frames;
-	buf->s_free -= frames;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S16LE */
-
 void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data,
 			    int prev_data_length)
 {
@@ -148,111 +91,4 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift)
 	}
 }
 
-#if CONFIG_FORMAT_S24LE
-void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel;
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int32x2 sample32;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int32) * num_channels;
-	const int out_inc = sizeof(ae_int16);
-	int i;
-
-	set_circular_buf1(buf->addr, buf->end_addr);
-	set_circular_buf0(source->addr, source->end_addr);
-
-	if (emph->enable) {
-		delay = emph->delay;
-		coef = emph->coef;
-		for (i = 0; i < frames; i++) {
-			AE_L32_XC(sample32, in, in_inc);
-			/* Shift left by 8 to sign-extend to Q1.31 */
-			sample32 = AE_SLAI32(sample32, 8);
-			/* Then shift right by 16 to get 16-bit */
-			sample32 = AE_SRAI32(sample32, 16);
-			sample = AE_SAT16X4(sample32, sample32);
-			/* Q1.15 -> Q1.31 */
-			temp = AE_CVT32X2F16_10(sample);
-			AE_MULAF16SS_00(temp, delay, coef);
-			delay = sample;
-			sample = AE_ROUND16X4F32SSYM(temp, temp);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-		emph->delay = delay;
-	} else {
-		for (i = 0; i < frames; i++) {
-			AE_L32_XC(sample32, in, in_inc);
-			/* Shift left by 8 to sign-extend to Q1.31 */
-			sample32 = AE_SLAI32(sample32, 8);
-			/* Then shift right by 16 to get 16-bit */
-			sample32 = AE_SRAI32(sample32, 16);
-			sample = AE_SAT16X4(sample32, sample32);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-	}
-
-	buf->s_avail += frames;
-	buf->s_free -= frames;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S24LE */
-
-#if CONFIG_FORMAT_S32LE
-void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel)
-{
-	struct audio_stream *source = bsource->data;
-	int num_channels = audio_stream_get_channels(source);
-	ae_int32 *in = (ae_int32 *)source->r_ptr + source_channel;
-	ae_int16 *out = (ae_int16 *)buf->w_ptr;
-	ae_int32x2 sample32;
-	ae_int16x4 sample;
-	ae_int32x2 temp;
-	ae_int16x4 coef;
-	ae_int16x4 delay;
-	const int in_inc = sizeof(ae_int32) * num_channels;
-	const int out_inc = sizeof(ae_int16);
-	int i;
-
-	set_circular_buf1(buf->addr, buf->end_addr);
-	set_circular_buf0(source->addr, source->end_addr);
-
-	if (emph->enable) {
-		delay = emph->delay;
-		coef = emph->coef;
-		for (i = 0; i < frames; i++) {
-			AE_L32_XC(sample32, in, in_inc);
-			/* S32: shift right by 16 to get 16-bit */
-			sample32 = AE_SRAI32(sample32, 16);
-			sample = AE_SAT16X4(sample32, sample32);
-			/* Q1.15 -> Q1.31 */
-			temp = AE_CVT32X2F16_10(sample);
-			AE_MULAF16SS_00(temp, delay, coef);
-			delay = sample;
-			sample = AE_ROUND16X4F32SSYM(temp, temp);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-		emph->delay = delay;
-	} else {
-		for (i = 0; i < frames; i++) {
-			AE_L32_XC(sample32, in, in_inc);
-			sample32 = AE_SRAI32(sample32, 16);
-			sample = AE_SAT16X4(sample32, sample32);
-			AE_S16_0_XC1(sample, out, out_inc);
-		}
-	}
-
-	buf->s_avail += frames;
-	buf->s_free -= frames;
-	buf->w_ptr = (int16_t *)out;
-}
-#endif /* CONFIG_FORMAT_S32LE */
-
 #endif /* MFCC_HIFI4 */
diff --git a/src/audio/mfcc/mfcc_ipc4.c b/src/audio/mfcc/mfcc_ipc4.c
new file mode 100644
index 000000000000..bb20d85e413b
--- /dev/null
+++ b/src/audio/mfcc/mfcc_ipc4.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_ipc4.c
+ * \brief IPC4-specific functions for MFCC component.
+ *
+ * Provides VAD switch control notification to user space via the
+ * IPC4 module notification mechanism.
+ */
+
+#include <sof/audio/mfcc/mfcc_comp.h>
+#include <sof/audio/module_adapter/module/generic.h>
+#include <sof/audio/component.h>
+#include <sof/ipc/msg.h>
+#include <sof/trace/trace.h>
+#include <ipc4/base-config.h>
+#include <ipc4/header.h>
+#include <ipc4/module.h>
+#include <ipc4/notification.h>
+#include <rtos/string.h>
+#include <errno.h>
+#include <stdint.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief Initialize IPC notification message for VAD switch control.
+ *
+ * Allocates cd->msg and fills the fields that stay constant between VAD
+ * notifications to user space. Returns 0, or -ENOMEM if allocation fails.
+ */
+int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct ipc_msg msg_proto;
+	struct comp_dev *dev = mod->dev;
+	struct comp_ipc_config *ipc_config = &dev->ipc_config;
+	union ipc4_notification_header *primary =
+		(union ipc4_notification_header *)&msg_proto.header;
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+
+	memset_s(&msg_proto, sizeof(msg_proto), 0, sizeof(msg_proto));
+	primary->r.notif_type = SOF_IPC4_MODULE_NOTIFICATION;
+	primary->r.type = SOF_IPC4_GLB_NOTIFICATION;
+	primary->r.rsp = SOF_IPC4_MESSAGE_DIR_MSG_REQUEST;
+	primary->r.msg_tgt = SOF_IPC4_MESSAGE_TARGET_FW_GEN_MSG;
+	cd->msg = ipc_msg_w_ext_init(msg_proto.header, msg_proto.extension,
+				     sizeof(struct sof_ipc4_notify_module_data) +
+				     sizeof(struct sof_ipc4_control_msg_payload) +
+				     sizeof(struct sof_ipc4_ctrl_value_chan));
+	if (!cd->msg) {
+		comp_err(dev, "Failed to initialize VAD notification");
+		return -ENOMEM;
+	}
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_module_data->instance_id = IPC4_INST_ID(ipc_config->id);
+	msg_module_data->module_id = IPC4_MOD_ID(ipc_config->id);
+	msg_module_data->event_id = SOF_IPC4_NOTIFY_MODULE_EVENTID_ALSA_MAGIC_VAL |
+		SOF_IPC4_SWITCH_CONTROL_PARAM_ID;
+	msg_module_data->event_data_size = sizeof(struct sof_ipc4_control_msg_payload) +
+		sizeof(struct sof_ipc4_ctrl_value_chan);
+
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->id = MFCC_CTRL_INDEX_VAD;
+	msg_payload->num_elems = 1;
+	msg_payload->chanv[0].channel = 0;
+
+	comp_dbg(dev, "VAD notification init: instance_id = 0x%08x, module_id = 0x%08x",
+		 msg_module_data->instance_id, msg_module_data->module_id);
+	return 0;
+}
+
+/**
+ * \brief Send VAD switch control notification to user space.
+ * \param mod Processing module; nothing is sent if its cd->msg is NULL.
+ * \param val VAD value: 1 = speech, 0 = silence.
+ */
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_notify_module_data *msg_module_data;
+	struct sof_ipc4_control_msg_payload *msg_payload;
+
+	if (!cd->msg)
+		return;
+
+	msg_module_data = (struct sof_ipc4_notify_module_data *)cd->msg->tx_data;
+	msg_payload = (struct sof_ipc4_control_msg_payload *)msg_module_data->event_data;
+	msg_payload->chanv[0].value = val;
+	ipc_msg_send(cd->msg, NULL, false);
+}
+
+int mfcc_get_config(struct processing_module *mod,
+		    uint32_t config_id, uint32_t *data_offset_size,
+		    uint8_t *fragment, size_t fragment_size)
+{
+	struct sof_ipc_ctrl_data *cdata = (struct sof_ipc_ctrl_data *)fragment;
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+	struct sof_ipc4_control_msg_payload *ctl;
+
+	comp_info(mod->dev, "entry");
+	/* NOTE(review): switch case accesses fragment without a fragment_size check - confirm */
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		ctl = (struct sof_ipc4_control_msg_payload *)fragment;
+		if (ctl->id == MFCC_CTRL_INDEX_VAD && ctl->num_elems == 1) {
+			ctl->chanv[0].value = cd->vad_prev ? 1 : 0;
+			*data_offset_size = sizeof(*ctl) + sizeof(ctl->chanv[0]);
+			return 0;
+		}
+		return -EINVAL;
+	default:
+		return comp_data_blob_get_cmd(cd->model_handler, cdata, fragment_size);
+	}
+}
+
+int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
+		    enum module_cfg_fragment_position pos, uint32_t data_offset_size,
+		    const uint8_t *fragment, size_t fragment_size, uint8_t *response,
+		    size_t response_size)
+{
+	struct mfcc_comp_data *cd = module_get_private_data(mod);
+
+	comp_info(mod->dev, "entry");
+
+	switch (config_id) {
+	case SOF_IPC4_SWITCH_CONTROL_PARAM_ID:
+		/* VAD switch is read-only: any switch set request is accepted and ignored */
+		return 0;
+	default:
+		return comp_data_blob_set(cd->model_handler, pos, data_offset_size,
+					  fragment, fragment_size);
+	}
+}
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..cc673d29b0da 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <sof/audio/mfcc/mfcc_vad.h>
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -127,6 +129,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 		return -EINVAL;
 	}
 
+	if (sample_rate > MFCC_MAX_SAMPLE_RATE) {
+		comp_err(dev, "Sample rate %d exceeds max %d Hz", sample_rate, MFCC_MAX_SAMPLE_RATE);
+		return -EINVAL;
+	}
+
 	if (config->sample_frequency != sample_rate) {
 		comp_err(dev, "Config sample_frequency does not match stream");
 		return -EINVAL;
@@ -328,15 +335,18 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 
 	/* Check that output data can be drained within the periods spanned by one
 	 * FFT hop. Each hop consumes fft_hop_size input samples and produces
-	 * max_out_per_hop + 2 (magic) int16_t output values. The sink provides at
-	 * least fft_hop_size * channels int16_t samples per hop (worst case s16).
+	 * max_out_per_hop + header int32_t output values. The sink provides
+	 * at least fft_hop_size * channels int32_t samples per hop (worst case s32).
 	 * If output exceeds this, data accumulates and will eventually overflow.
+	 * This check is not needed in compress output mode where only actual data
+	 * bytes are committed without zero padding.
 	 */
-	int out_per_hop = max_out_per_hop + 2;
+	int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int32_t);
 	int sink_per_hop = fft->fft_hop_size * channels;
+	bool skip_size_check = config->compress_output;
 
-	if (out_per_hop > sink_per_hop) {
-		comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)",
+	if (!skip_size_check && out_per_hop > sink_per_hop) {
+		comp_err(dev, "Output %d int32 per hop exceeds sink capacity %d (hop %d x ch %d)",
 			 out_per_hop, sink_per_hop, fft->fft_hop_size, channels);
 		ret = -EINVAL;
 		goto free_lifter;
@@ -345,10 +355,24 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	/* Set initial state for STFT */
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
-	state->magic_pending = false;
+	state->header_pending = false;
+	state->hop_count = 0;
+	memset(&state->header, 0, sizeof(state->header));
+	state->header.magic = MFCC_MAGIC;
 	state->out_data_ptr = NULL;
-	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
+	state->vad_silence_count = 0;
+	state->dtx_trailing_silence = config->dtx_trailing_silence_hops;
+	state->dtx_silence_interval = config->dtx_silence_hops_interval;
+	state->dtx_silence_counter = 0;
+
+	if (config->enable_vad) {
+		ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+		if (ret < 0) {
+			comp_err(dev, "Failed VAD init");
+			goto free_lifter;
+		}
+	}
 
 	comp_dbg(dev, "done");
 	return 0;
@@ -378,15 +402,27 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	return ret;
 }
 
+static void mfcc_free_and_null(struct processing_module *mod, void **ptr)
+{
+	mod_free(mod, *ptr);
+	*ptr = NULL;
+}
+
+/* Free MFCC buffers to prevent leaks on reset->prepare cycles.
+ * mfcc_free_buffers() NULLs the pointers after free.
+ */
 void mfcc_free_buffers(struct processing_module *mod)
 {
 	struct mfcc_comp_data *cd = module_get_private_data(mod);
 
 	mod_fft_plan_free(mod, cd->state.fft.fft_plan);
-	mod_free(mod, cd->state.fft.fft_buf);
-	mod_free(mod, cd->state.fft.fft_out);
-	mod_free(mod, cd->state.buffers);
-	mod_free(mod, cd->state.melfb.data);
-	mod_free(mod, cd->state.dct.matrix);
-	mod_free(mod, cd->state.lifter.matrix);
+	cd->state.fft.fft_plan = NULL;
+	mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_buf);
+	mfcc_free_and_null(mod, (void **)&cd->state.fft.fft_out);
+	mfcc_free_and_null(mod, (void **)&cd->state.buffers);
+	mfcc_free_and_null(mod, (void **)&cd->state.melfb.data);
+	mfcc_free_and_null(mod, (void **)&cd->state.dct.matrix);
+	mfcc_free_and_null(mod, (void **)&cd->state.lifter.matrix);
+	mfcc_free_and_null(mod, (void **)&cd->vad.noise_floor);
+	mfcc_free_and_null(mod, (void **)&cd->vad.weights);
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..f44a89a7dea3
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/module_adapter/module/generic.h>
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stddef.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0).
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE	36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	    6,     8,    10,    13,    16,    20,    25,    32,
+	   40,    50,    63,    80,   100,   125,   160,   200,
+	  250,   315,   400,   500,   630,   800,  1000,  1250,
+	 1600,  2000,  2500,  3150,  4000,  5000,  6300,  8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ *        to INT16_MAX (32767).  Original dB values converted via
+ *        10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	    2,     4,     9,    19,    43,    85,   162,   299,
+	  531,   862,  1382,  2140,  3129,  4370,  6172,  8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045,  9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency.  Output weights are Q1.15, saturated
+ * to the int16_t range, and sum to approximately 2^15.
+ *
+ * \param[out] weights Output weight array with num_mel entries.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int32_t sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_end = psy_hz_to_mel((int16_t)(sample_rate / 2)); /* Nyquist (max 32767 Hz) in Mel */
+	int16_t mel_step = mel_end / (num_mel + 1);
+	int i, j;
+
+	if (!num_mel)
+		return;
+
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize to sum 1.0; saturate because a lone bin rounds to 32768, outside int16_t */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15)); /* Round to Q1.15 */
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad)
+		return -EINVAL;
+
+	if (num_mel_bins <= 0)
+		return -EINVAL;
+	/* Load MFCC_VAD_* default parameters and clear the detector state */
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Allocate per-bin noise floor */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Allocate and compute per-bin weights */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/* Process one Mel log frame: returns 1 for speech, 0 for no speech or NULL arguments. */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t signal_energy = 0;
+	int64_t noise_energy = 0;
+	int64_t energy_delta = 0;
+	int32_t delta, p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	/* Stop incrementing after init phase to avoid wrap-around restarting fast alpha.
+	 * Select rise alpha based on convergence phase.
+	 */
+	if (vad->frame_count < vad->init_frames) {
+		vad->frame_count++;
+		alpha = vad->noise_rise_alpha_fast;
+	} else {
+		alpha = vad->noise_rise_alpha_slow;
+	}
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted signal energy and noise floor energy.
+	 * weights are Q1.15, mel values are Q9.23
+	 * Products are Q10.38; accumulate in int64_t, then round to Q9.23 and saturate to int32
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		signal_energy += (int64_t)vad->weights[i] * mel_log[i];
+		noise_energy += (int64_t)vad->weights[i] * vad->noise_floor[i];
+	}
+
+	vad->energy = sat_int32(Q_SHIFT_RND(signal_energy, 38, 23));
+	vad->noise_energy = sat_int32(Q_SHIFT_RND(noise_energy, 38, 23));
+	/* Widen before subtracting so the difference cannot overflow 32-bit arithmetic */
+	energy_delta = (int64_t)vad->energy - vad->noise_energy;
+
+	/* Declare speech above the threshold; otherwise hold it while the hangover counts down */
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..f825afb758a6
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,189 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create a configuration blob for the SOF
+MFCC component. Run it in Matlab or Octave with the command
+`setup_mfcc`. The MFCC configuration parameters can be edited in the
+script.
+
+## Testbench
+
+The configuration can be test run with the testbench. First, create the
+test topologies with `scripts/build-tools.sh -t`. Next, build the
+testbench with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
+
+```
+./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
+```
+
+Output files from host testbench:
+
+| File | Content |
+|------|---------|
+| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients |
+| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram |
+
+If the `XTENSA_PATH` environment variable is set, the script also runs
+the Xtensa build of the testbench (via `xt-run`) and produces additional
+output files prefixed with `xt_`:
+
+| File | Content |
+|------|---------|
+| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients |
+| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram |
+
+## Decoding and Plotting
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the `decode_all.m` script:
+
+```matlab
+decode_all
+```
+
+This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and
+`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+```matlab
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
+```
+
+Here the value 13 comes from the configuration script: MFCC was set up to
+output 13 cepstral coefficients from each FFT → Mel → DCT → cepstral
+coefficients computation run.
+
+The 80-band Mel output can be visualized with the command:
+
+```matlab
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+```
+
+## Live Whisper Transcription with DSP VAD
+
+The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`.
+It can be used with the development topologies
+`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and
+`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. It captures Mel audio features
+and VAD flags from the default audio device `hw:0,47` (headset microphone).
+Captured frames with detected speech are sent to the Whisper speech
+recognition model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+<https://docs.openvino.ai/2025/get-started/install-openvino.html>.
+
+The following Python pip installs are needed into the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+The script by default runs the Whisper encoder model on the NPU. To
+use the NPU, install the driver from
+<https://github.com/intel/linux-npu-driver/releases>. If the NPU is not
+available, change the encoder to CPU with the run option `--encoder-device CPU`.
+With a GPU, both `--encoder-device GPU` and `--decoder-device GPU` can be set.
+
+### Example run
+
+Check which capture devices are available.
+
+```bash
+arecord -l
+```
+
+In this example the devices hw:0,47 and hw:0,48 support the audio
+features stream.
+
+```bash
+**** List of CAPTURE Hardware Devices ****
+card 0: sofsoundwire [sof-soundwire], device 1: Jack In (*) []
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+card 0: sofsoundwire [sof-soundwire], device 4: Microphone (*) []
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+card 0: sofsoundwire [sof-soundwire], device 47: Jack In Audio Features (*) []
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+card 0: sofsoundwire [sof-soundwire], device 48: Microphone Audio Features (*) []
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+```
+
+To run the Whisper model on the CPU with the laptop's internal
+microphones, the run command is:
+
+```bash
+python3 sof_mel_to_text_live_dsp_vad.py --encoder-device CPU --device hw:0,48
+```
+
+The script output is shown below:
+
+```bash
+=== Live SOF Mel → Whisper Transcription (DSP VAD) ===
+
+Starting capture: arecord -D hw:0,48 -f S32_LE -c 2 -r 16000 -t raw --buffer-size 8192
+VAD source: DSP (embedded in stream)
+Silence trigger: 100ms (10 frames)
+Whisper model: whisper-medium-int4-ov (encoder: CPU, decoder: CPU)
+
+  [   0.01s] SILENCE
+  [   1.39s] SPEECH
+  [   2.57s] SILENCE
+  [   2.66s] Transcribing 118 frames (1.2s)...
+  [Whisper] encoder: 1.30s
+  [Whisper] decoder: 0.59s (3 tokens)
+
+  >> "Hello computer"
+```
+
+## Live Spectrogram Viewers
+
+### Mel Spectrogram
+
+The `sof_mel_spectrogram_compress.py` script captures Mel spectrogram
+frames from a SOF compress PCM device and displays them as a live
+scrolling spectrogram with VAD status. This is a lightweight viewer
+that does not run Whisper inference.
+
+```bash
+python3 sof_mel_spectrogram_compress.py --card 0 --device 48 --width 300
+```
+
+### Cepstral Spectrogram
+
+The `sof_ceps_spectrogram_compress.py` script is similar but displays
+cepstral coefficients (MFCC) instead of Mel bands.
+
+```bash
+python3 sof_ceps_spectrogram_compress.py --card 0 --device 48 --num-ceps 13 --width 300
+```
+
+## Live Whisper Transcription with Compress PCM
+
+The `sof_mel_to_text_live_compress.py` script captures Mel spectrogram
+frames from a SOF compress PCM device and performs live Whisper
+transcription using OpenVINO. Unlike `sof_mel_to_text_live_dsp_vad.py`
+which uses `arecord`, this script reads directly from the compress PCM
+device with DTX-aware frame handling.
+
+```bash
+python3 sof_mel_to_text_live_compress.py --card 0 --device 48 --model whisper-medium-int4-ov
+```
+
+The same OpenVINO prerequisites and pip packages apply as described above
+for `sof_mel_to_text_live_dsp_vad.py`.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
-
-./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
-
-Output files from host testbench:
-  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
-  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
-
-If the XTENSA_PATH environment variable is set, the script also runs
-the Xtensa build of the testbench (via xt-run) and produces additional
-output files prefixed with "xt_":
-  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
-  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
-
-All output files can be decoded and plotted at once in Matlab or Octave
-with the decode_all.m script:
-
-decode_all
-
-This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
-decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
-files that exist including the Xtensa variants.
-
-Individual files can also be decoded manually:
-
-[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
-
-In the above it's known from configuration script that MFCC was set up to
-output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
-coefficients computation run.
-
-The 80 bands Mel output can be visualized with command:
-
-[mel, t, n] = decode_mel('mel_s16.raw', 80);
-
-Other kind of signals have quite big visual difference in audio features. Try
-e.g. other sound files found in computer.
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m
index d5b60289b4cf..4c377bf5029a 100644
--- a/src/audio/mfcc/tune/decode_all.m
+++ b/src/audio/mfcc/tune/decode_all.m
@@ -6,12 +6,11 @@
 num_ceps = 13;
 num_mel = 80;
 
-% MFCC cepstral output files
+% MFCC cepstral output files (all int32 output, Q9.23)
 ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'};
 
-% Mel output files with corresponding format
+% Mel output files (all int32 output, Q9.23)
 mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'};
-mel_fmts  = {'s16',         's24',          's32'};
 
 % Xtensa prefixed variants
 xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'};
@@ -19,21 +18,21 @@
 
 all_ceps_files = [ceps_files, xt_ceps_files];
 all_mel_files  = [mel_files, xt_mel_files];
-all_mel_fmts   = [mel_fmts, mel_fmts];
 
 for i = 1:length(all_ceps_files)
 	fn = all_ceps_files{i};
 	if exist(fn, 'file')
 		fprintf('Decoding MFCC ceps: %s\n', fn);
-		[ceps, t, n] = decode_ceps(fn, num_ceps);
+		[ceps, t, n, vad, energy, noise_energy, frame_num] = ...
+			decode_ceps(fn, num_ceps);
 	end
 end
 
 for i = 1:length(all_mel_files)
 	fn = all_mel_files{i};
-	fmt = all_mel_fmts{i};
 	if exist(fn, 'file')
 		fprintf('Decoding Mel: %s\n', fn);
-		[mel, t, n] = decode_mel(fn, num_mel, fmt);
+		[mel, t, n, vad, energy, noise_energy, frame_num] = ...
+			decode_mel(fn, num_mel);
 	end
 end
diff --git a/src/audio/mfcc/tune/decode_ceps.m b/src/audio/mfcc/tune/decode_ceps.m
index a63677fa3731..480eadea2945 100644
--- a/src/audio/mfcc/tune/decode_ceps.m
+++ b/src/audio/mfcc/tune/decode_ceps.m
@@ -1,57 +1,112 @@
-% [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+% [ceps, t, n, vad, energy, noise_energy, frame_number] = decode_ceps(fn, num_ceps, hop, num_channels)
 %
 % Input
 %   fn - File with MFCC data in .raw or .wav format
 %   num_ceps - number of cepstral coefficients per frame
-%   num_channels - needed for .raw format, omit for .wav
+%   hop - STFT hop in seconds, defaults to 10e-3 for 10 ms
+%   num_channels - needed for .raw format, omit for .wav, default 1
 %
 % Outputs
 %   ceps - cepstral coefficients
 %   t - time vector for plotting
 %   n - ceps 1..num_ceps vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
-% Copyright(c) 2022 Intel Corporation. All rights reserved.
+% Copyright(c) 2022-2026 Intel Corporation. All rights reserved.
 
-function [ceps, t, n] = decode_ceps(fn, num_ceps, num_channels)
+function [ceps, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_ceps(fn, num_ceps, hop, num_channels)
 
 if nargin < 3
+	hop = 10e-3;
+end
+if nargin < 4
 	num_channels = 1;
 end
 
 % MFCC stream
-fs = 16e3;
-qformat = 7;
-magic = [25443 28006]; % ASCII 'mfcc' as int16
+qformat = 23; % Q9.23 in int32
+magic = int32(1835426659); % 0x6D666363 as int32
+num_magic = 1; % magic word is 1 x int32
 
-% Load output data
+% Load output data (always int32)
 [data, num_channels] = get_file(fn, num_channels);
 
-idx1 = find(data == magic(1));
-idx = [];
-for i = 1:length(idx1)
-	if data(idx1(i) + 1) == magic(2)
-		idx = [idx idx1(i)];
-	end
-end
+idx = find(data == magic);
 
 if isempty(idx)
 	error('No magic value markers found from stream');
 end
 
-period_ceps = idx(2)-idx(1);
 num_frames = length(idx);
-t_ceps = period_ceps / num_channels / fs;
-t = (0:num_frames -1) * t_ceps;
-n = 1:num_ceps;
 
-ceps = zeros(num_ceps, num_frames);
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32, followed by num_ceps coefficients (int32).
+payload_len = 5 + num_ceps;
+
+% Last frame can be incomplete due to span over multiple periods
+last = idx(end) + num_magic + payload_len - 1;
+if (last > length(data))
+    num_frames = num_frames - 1;
+end
+
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
-	i1 = idx(i) + 2;
-	i2 = i1 + num_ceps - 1;
-	ceps(:,i) = data(i1:i2) / 2^qformat;
+	i1 = idx(i) + num_magic;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
+frame_number = payload(1, :);
+% payload(2,:) is reserved, skip
+energy = payload(3, :) / 2^23;
+noise_energy = payload(4, :) / 2^23;
+vad = payload(5, :);
+ceps = payload(6:payload_len, :) / 2^qformat;
+
+% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline.
+% Missing frames repeat the last received values; their VAD flag stays 0.
+first_frame = frame_number(1);
+last_frame = frame_number(end);
+total_frames = last_frame - first_frame + 1;
+if total_frames > num_frames
+	ceps_fill = min(ceps(:));
+	ceps_full = ones(num_ceps, total_frames) * ceps_fill;
+	vad_full = zeros(1, total_frames);
+	energy_full = zeros(1, total_frames);
+	noise_energy_full = zeros(1, total_frames);
+	frame_number_full = first_frame:last_frame;
+	has_data = false(1, total_frames);
+	for i = 1:num_frames
+		fi = frame_number(i) - first_frame + 1;
+		ceps_full(:, fi) = ceps(:, i);
+		vad_full(fi) = vad(i);
+		energy_full(fi) = energy(i);
+		noise_energy_full(fi) = noise_energy(i);
+		has_data(fi) = true;
+	end
+	% Forward-fill gaps with last received values
+	for fi = 2:total_frames
+		if ~has_data(fi)
+			ceps_full(:, fi) = ceps_full(:, fi - 1);
+			energy_full(fi) = energy_full(fi - 1);
+			noise_energy_full(fi) = noise_energy_full(fi - 1);
+		end
+	end
+	ceps = ceps_full;
+	vad = vad_full;
+	energy = energy_full;
+	noise_energy = noise_energy_full;
+	frame_number = frame_number_full;
+end
+
+t = (frame_number - first_frame) * hop;
+n = 1:num_ceps;
+
 figure;
 surf(t, n, ceps, 'EdgeColor', 'none');
 colormap(jet);
@@ -70,18 +125,18 @@
 switch lower(ext)
 	case '.raw'
 		fh = fopen(fn, 'r');
-		data = fread(fh, 'int16');
+		data = fread(fh, 'int32');
 		fclose(fh);
 	case '.wav'
 		tmp = audioread(fn, 'native');
 		t = whos('tmp');
-		if ~strcmp(t.class, 'int16');
-			error('Only 16-bit wav file format is supported');
+		if ~strcmp(t.class, 'int32')
+			error('Expected 32-bit wav for int32 MFCC output format');
 		end
 		s = size(tmp);
 		num_channels = s(2);
 		if num_channels > 1
-			data = int16(zeros(prod(s), 1));
+			data = int32(zeros(prod(s), 1));
 			for i = 1:num_channels
 				data(i:num_channels:end) = tmp(:, i);
 			end
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..0aca1e35ec8d 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,23 +1,28 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad, energy, noise_energy, frame_number] = decode_mel(fn, num_mel, hop, num_channels)
 %
 % Input
 %   fn - File with Mel data in .raw or .wav format
 %   num_mel - number of Mel coefficients per frame
+%   hop - STFT hop in seconds, defaults to 10e-3 for 10 ms
-%   fmt - format of the Mel data ('s16', 's24', 's32')
-%   num_channels - needed for .raw format, omit for .wav
+%   num_channels - needed for .raw format, omit for .wav, default 1
 %
 % Outputs
 %   mel - Mel coefficients
 %   t - time vector for plotting
 %   n - mel 1..num_mel vector for plotting
+%   vad - VAD flag per frame from DSP
+%   energy - weighted signal energy per frame from DSP
+%   noise_energy - weighted noise floor energy per frame from DSP
+%   frame_number - frame number from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad, energy, noise_energy, frame_number] = ...
+	decode_mel(fn, num_mel, hop, num_channels)
 
 if nargin < 3
-	fmt = 's16';
+	hop = 10e-3;
 end
 if nargin < 4
 	num_channels = 1;
@@ -25,42 +30,21 @@
 
 % MFCC stream
 fs = 16e3;
+qformat = 23; % Q9.23 in int32
 
-switch fmt
-  case 's16'
-    qformat = 7;
-    magic = [25443 28006]; % ASCII 'mfcc' as two int16
-    num_magic = 2;
-  case 's24'
-    qformat = 15;
-    magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
-  case 's32'
-    qformat = 23;
-    magic = int32(1835426659); % 0x6D666363 as int32
-    num_magic = 1;
-    otherwise
-    error("Use 's16', 's24', or 's32' as format.");
-end
+magic = int32(1835426659); % 0x6D666363 as int32
+num_magic = 1; % magic word is 1 x int32
+num_other_header = 5; % frame_number, reserved, energy, noise, vad (all int32)
 
-% Load output data
-[data, num_channels] = get_file(fn, num_channels, fmt);
-
-if strcmp(fmt, 's16')
-	idx1 = find(data == magic(1));
-	idx = [];
-	for i = 1:length(idx1)
-		next_word = idx1(i) + 1;
-		if next_word <= length(data)
-			if data(next_word) == magic(2)
-				idx = [idx idx1(i)];
-			end
-		end
-	end
-else
-	idx = find(data == magic);
+% Load output data (always int32)
+[data, num_channels] = get_file(fn, num_channels);
+
+if isempty(data)
+	error('File %s is empty', fn);
 end
 
+idx = find(data == magic);
+
 if isempty(idx)
 	error('No magic value markers found from stream');
 end
@@ -68,65 +52,112 @@
 period_mel = idx(2)-idx(1);
 num_frames = length(idx);
 
+% Header after magic is [frame_number, reserved, energy, noise_energy, vad_flag]
+% as int32, followed by num_mel coefficients.
+% The output words are int32 regardless of the input PCM format.
+payload_len = num_other_header + num_mel;
+
 % Last frame can be incomplete due to span over multiple periods
-last = idx(end) + num_mel - 1;
+last = idx(end) + num_magic + payload_len - 1;
 if (last > length(data))
     num_frames = num_frames - 1;
 end
 
-t_mel = period_mel / num_channels / fs;
-t = (0:num_frames -1) * t_mel;
-n = 1:num_mel;
-
-mel = zeros(num_mel, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
 	i1 = idx(i) + num_magic;
-	i2 = i1 + num_mel - 1;
-	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
-figure;
+frame_number = payload(1, :);
+% payload(2,:) is reserved, skip
+energy = payload(3, :) / 2^23;
+noise_energy = payload(4, :) / 2^23;
+vad = payload(5, :);
+mel = payload(6:payload_len, :) / 2^qformat;
+
+% Fill gaps from DTX-suppressed VAD=0 frames to create continuous timeline.
+% Missing frames repeat the last received values; their VAD flag stays 0.
+first_frame = frame_number(1);
+last_frame = frame_number(end);
+total_frames = last_frame - first_frame + 1;
+if total_frames > num_frames
+	mel_fill = min(mel(:));
+	mel_full = ones(num_mel, total_frames) * mel_fill;
+	vad_full = zeros(1, total_frames);
+	energy_full = zeros(1, total_frames);
+	noise_energy_full = zeros(1, total_frames);
+	frame_number_full = first_frame:last_frame;
+	has_data = false(1, total_frames);
+	for i = 1:num_frames
+		fi = frame_number(i) - first_frame + 1;
+		mel_full(:, fi) = mel(:, i);
+		vad_full(fi) = vad(i);
+		energy_full(fi) = energy(i);
+		noise_energy_full(fi) = noise_energy(i);
+		has_data(fi) = true;
+	end
+	% Forward-fill gaps with last received values
+	for fi = 2:total_frames
+		if ~has_data(fi)
+			mel_full(:, fi) = mel_full(:, fi - 1);
+			energy_full(fi) = energy_full(fi - 1);
+			noise_energy_full(fi) = noise_energy_full(fi - 1);
+		end
+	end
+	mel = mel_full;
+	vad = vad_full;
+	energy = energy_full;
+	noise_energy = noise_energy_full;
+	frame_number = frame_number_full;
+end
+
+t = (frame_number - first_frame) * hop;
+n = 1:num_mel;
+
+figure
 imagesc(t, n, mel);
 axis xy;
 colormap(jet);
 colorbar;
 tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
 title(tstr, 'Interpreter', 'None');
-xlabel('Time (s)');
 ylabel('Mel coef #');
 
+figure
+subplot(2,1,1);
+plot(t, vad)
+ax = axis();
+axis([ax(1:2) -0.1 1.1]);
+grid on;
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('VAD flag');
+
+subplot(2,1,2);
+plot(t, energy, t, noise_energy);
+grid on;
+legend('Energy', 'Noise Energy');
+xlabel('Time (s)');
+ylabel('Energy');
+
 end
 
-function [data, num_channels] = get_file(fn, num_channels, fmt)
+function [data, num_channels] = get_file(fn, num_channels)
 
 [~, ~, ext] = fileparts(fn);
 
-switch fmt
-	case 's16'
-		read_fmt = 'int16';
-	case {'s24', 's32'}
-		read_fmt = 'int32';
-	otherwise
-		error("Use 's16', 's24', or 's32' as format.");
-end
-
 switch lower(ext)
 	case '.raw'
 		fh = fopen(fn, 'r');
-		data = fread(fh, read_fmt);
+		data = fread(fh, 'int32');
 		fclose(fh);
 	case '.wav'
 		tmp = audioread(fn, 'native');
 		t = whos('tmp');
-		switch fmt
-			case 's16'
-				if ~strcmp(t.class, 'int16')
-					error('Expected 16-bit wav for s16 format');
-				end
-			case {'s24', 's32'}
-				if ~strcmp(t.class, 'int32')
-					error('Expected 32-bit wav for %s format', fmt);
-				end
+		if ~strcmp(t.class, 'int32')
+			error('Expected 32-bit wav for int32 MFCC output format');
 		end
 		s = size(tmp);
 		num_channels = s(2);
diff --git a/src/audio/mfcc/tune/setup_mfcc.m b/src/audio/mfcc/tune/setup_mfcc.m
index bd2b3f11e60b..dbf69587a74f 100644
--- a/src/audio/mfcc/tune/setup_mfcc.m
+++ b/src/audio/mfcc/tune/setup_mfcc.m
@@ -25,6 +25,32 @@ function setup_mfcc()
 	setup.tplg_fn = 'mel80.conf';
 	export_mfcc_setup(gen_cfg, setup);
 
+	% Blob for mel spectrogram with compress PCM output
+	setup = get_mel_spectrogram_config();
+	setup.compress_output = true;
+	setup.tplg_fn = 'mel80_compress.conf';
+	export_mfcc_setup(gen_cfg, setup);
+
+	% Blob for mel spectrogram with compress PCM output and DTX
+	setup = get_mel_spectrogram_config();
+	setup.compress_output = true;
+	setup.enable_dtx = true;
+	setup.dtx_trailing_silence_hops = 20;
+	setup.dtx_silence_hops_interval = 500;
+	setup.tplg_fn = 'mel80_compress_dtx.conf';
+	export_mfcc_setup(gen_cfg, setup);
+
+	% Default MFCC (cepstral) with compress PCM output, VAD and DTX
+	setup = get_mfcc_default_config();
+	setup.compress_output = true;
+	setup.enable_vad = true;
+	setup.enable_dtx = true;
+	setup.dtx_trailing_silence_hops = 20;
+	setup.dtx_silence_hops_interval = 500;
+	setup.update_controls = true;
+	setup.tplg_fn = 'ceps13_compress_dtx.conf';
+	export_mfcc_setup(gen_cfg, setup);
+
 end
 
 function cfg = get_mfcc_default_config()
@@ -62,6 +88,12 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % same
 	cfg.mmax_coef = 0; % same
 	cfg.dynamic_mmax = false; % same
+	cfg.enable_vad = false;
+	cfg.enable_dtx = false;
+	cfg.dtx_trailing_silence_hops = 0;
+	cfg.dtx_silence_hops_interval = 0;
+	cfg.update_controls = false;
+	cfg.compress_output = false;
 end
 
 function cfg = get_mel_spectrogram_config()
@@ -99,6 +131,12 @@ function setup_mfcc()
 	cfg.mmax_init = 0; % Initial value max Mel value, data clamp is mmax - top_db
 	cfg.mmax_coef = 0; % Dynamic max Mel value decay coefficient (zero lock to found max)
 	cfg.dynamic_mmax = true;
+	cfg.enable_vad = true;
+	cfg.enable_dtx = false;
+	cfg.dtx_trailing_silence_hops = 0;
+	cfg.dtx_silence_hops_interval = 0;
+	cfg.update_controls = true;
+	cfg.compress_output = false;
 end
 
 function export_mfcc_setup(gen_cfg, cfg)
@@ -107,7 +145,7 @@ function export_mfcc_setup(gen_cfg, cfg)
 addpath([gen_cfg.tools_path 'tune/common']);
 
 %% Blob size, size plus reserved(8) + current parameters
-nbytes_data = 104;
+nbytes_data = 116;
 
 %% Little endian
 sh32 = [0 -8 -16 -24];
@@ -133,8 +171,10 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = q_convert(cfg.mmax_init, 7);                 [b8, j] = add_w16b(v, b8, j);
 v = q_convert(cfg.mmax_coef, 15);                [b8, j] = add_w16b(v, b8, j);
 
+v = cfg.dtx_trailing_silence_hops;                [b8, j] = add_w16b(v, b8, j); % DTX trailing silence hops
+v = cfg.dtx_silence_hops_interval;                [b8, j] = add_w16b(v, b8, j); % DTX silence frame interval
 % Reserved
-for i = 1:6
+for i = 1:5
 	[b8, j] = add_w32b(0, b8, j);
 end
 
@@ -160,6 +200,10 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_high Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_low Qx.y TBD
 v = 0;                                           [b8, j] = add_w16b(v, b8, j); % vtln_warp Qx.y TBD
+% reserved16[3]
+for i = 1:3
+	[b8, j] = add_w16b(0, b8, j);
+end
 v = cfg.htk_compat;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.raw_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.remove_dc_offset;                        [b8, j] = add_w8b(v, b8, j); % bool
@@ -168,6 +212,14 @@ function export_mfcc_setup(gen_cfg, cfg)
 v = cfg.subtract_mean;                           [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.use_energy;                              [b8, j] = add_w8b(v, b8, j); % bool
 v = cfg.dynamic_mmax;                            [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_vad;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.enable_dtx;                              [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.update_controls;                         [b8, j] = add_w8b(v, b8, j); % bool
+v = cfg.compress_output;                         [b8, j] = add_w8b(v, b8, j); % bool
+% reserved_bool[4]
+for i = 1:4
+	[b8, j] = add_w8b(0, b8, j);
+end
 
 %% Export
 tplg_fn = [gen_cfg.mfcc_conf_path cfg.tplg_fn];
diff --git a/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py
new file mode 100644
index 000000000000..3a61641c0812
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_ceps_spectrogram_compress.py
@@ -0,0 +1,234 @@
+"""Live scrolling cepstral coefficient viewer for SOF compress PCM capture.
+
+Displays a real-time scrolling MFCC (cepstral coefficient) plot and VAD
+strip from ALSA compress PCM capture (crecord) with embedded DSP VAD flag.
+
+Frame format: [magic(int32), frame_number(uint32), reserved(int32),
+               energy(int32), noise_energy(int32), vad_flag(int32),
+               ceps[0..N-1](int32)]
+
+Cepstral coefficients are in Q9.23 fixed-point format.
+
+Usage:
+    python sof_ceps_spectrogram_compress.py [--card 0] [--device 48]
+    python sof_ceps_spectrogram_compress.py --num-ceps 13 --width 300
+"""
+
+import argparse
+import os
+import queue
+import struct
+import subprocess
+import threading
+import numpy as np
+import matplotlib
+matplotlib.use('TkAgg')
+import matplotlib.pyplot as plt
+
+# SOF compress frame format constants (with DSP data header)
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)  # ASCII 'mfcc' as int32
+SOF_NUM_HEADER = 6            # magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_HEADER_BYTES = SOF_NUM_HEADER * 4  # 24 bytes
+SOF_HOP_SEC = 0.01           # 10 ms per STFT hop
+
+SPECTROGRAM_WIDTH = 300       # default number of frames visible
+DEFAULT_NUM_CEPS = 13         # default cepstral coefficients
+Q_FORMAT = 23                 # Q9.23 fixed-point
+
+
+def decode_ceps_frame(raw_ints):
+    """Convert int32 Q9.23 cepstral coefficients to float32."""
+    return raw_ints.astype(np.float32) / (2 ** Q_FORMAT)
+
+
+def parse_frame(buf, num_ceps):
+    """Parse one complete ceps frame from a bytearray buffer.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), ceps[0..N-1](N*4B)]
+
+    Mutates buf in-place (deletes consumed bytes).
+    Returns: (frame_number, vad_flag, ceps_ints) or (None, None, None)
+    """
+    frame_bytes = SOF_HEADER_BYTES + num_ceps * 4
+    idx = buf.find(SOF_MAGIC_BYTES)
+    if idx < 0:
+        if len(buf) > 3:
+            del buf[:-3]
+        return None, None, None
+    end = idx + frame_bytes
+    if end > len(buf):
+        del buf[:idx]
+        return None, None, None
+
+    frame_number = struct.unpack_from('<I', buf, idx + 4)[0]
+    vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+
+    ceps_bytes = bytes(buf[idx + SOF_HEADER_BYTES:end])
+    ceps_ints = np.frombuffer(ceps_bytes, dtype=np.int32)
+    del buf[:end]
+    return frame_number, vad_flag, ceps_ints
+
+
+def run_spectrogram(card, device, width, num_ceps):
+    """Capture compress PCM ceps frames and display scrolling spectrogram."""
+
+    frame_bytes = SOF_HEADER_BYTES + num_ceps * 4
+
+    ceps_buf = np.zeros((num_ceps, width), dtype=np.float32)
+    vad_buf = np.zeros(width, dtype=np.float32)
+    x = np.arange(width)
+
+    fig, (ax_ceps, ax_vad) = plt.subplots(
+        2, 1, figsize=(12, 5),
+        gridspec_kw={'height_ratios': [5, 1]},
+        sharex=True
+    )
+    fig.tight_layout(pad=2.0)
+
+    im_ceps = ax_ceps.imshow(
+        ceps_buf, aspect='auto', origin='lower',
+        interpolation='nearest', cmap='turbo',
+        vmin=-50.0, vmax=50.0
+    )
+    ax_ceps.set_ylabel('Cepstral coefficient')
+    ax_ceps.set_title(f'SOF MFCC Cepstral Coefficients ({num_ceps} ceps, compress PCM) — DSP VAD')
+    fig.colorbar(im_ceps, ax=ax_ceps, fraction=0.02, pad=0.02)
+
+    line_vad, = ax_vad.plot(
+        x, vad_buf, color='green', linewidth=1.5,
+        drawstyle='steps-post')
+    ax_vad.set_ylabel('VAD')
+    ax_vad.set_xlabel('Frame')
+    ax_vad.set_ylim(-0.1, 1.1)
+    ax_vad.set_yticks([0, 1])
+    ax_vad.set_yticklabels(['Silent', 'Speech'])
+
+    plt.ion()
+    plt.show(block=False)
+    fig.canvas.draw()
+    fig.canvas.flush_events()
+
+    # Start crecord capture
+    crecord_cmd = [
+        'crecord', '-v',
+        '-c', str(card),
+        '-d', str(device),
+        '-I', 'BESPOKE',
+        '-R', '16000',
+        '-C', '2',
+        '-F', 'S32_LE',
+    ]
+    cmd = ['stdbuf', '-o0'] + crecord_cmd
+
+    print(f"Starting compress capture: {' '.join(crecord_cmd)}")
+    print(f"Cepstral coefficients: {num_ceps} (frame size: {frame_bytes} bytes)")
+    print(f"Spectrogram width: {width} frames ({width * SOF_HOP_SEC:.1f}s)")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                            bufsize=0)
+
+    frame_q = queue.Queue()
+
+    def reader_thread():
+        buf = bytearray()
+        raw_fd = proc.stdout.fileno()
+        try:
+            while True:
+                data = os.read(raw_fd, frame_bytes * 4)
+                if not data:
+                    break
+                buf.extend(data)
+                while True:
+                    frame_number, vad_flag, ceps_ints = parse_frame(buf, num_ceps)
+                    if ceps_ints is None:
+                        break
+                    frame_q.put((frame_number, vad_flag, ceps_ints))
+        except (OSError, ValueError):
+            pass
+        frame_q.put(None)
+
+    reader = threading.Thread(target=reader_thread, daemon=True)
+    reader.start()
+
+    recv_frames = 0
+    prev_speech = None
+
+    try:
+        while True:
+            try:
+                item = frame_q.get(timeout=0.05)
+            except queue.Empty:
+                fig.canvas.flush_events()
+                continue
+
+            if item is None:
+                stderr_out = proc.stderr.read().decode(errors='replace')
+                rc = proc.wait()
+                print(f"\ncrecord exited with code {rc}")
+                if stderr_out:
+                    print(f"stderr: {stderr_out}")
+                break
+
+            frame_number, vad_flag, ceps_ints = item
+            recv_frames += 1
+            ceps = decode_ceps_frame(ceps_ints)
+            speech = vad_flag != 0
+
+            if speech != prev_speech:
+                t = frame_number * SOF_HOP_SEC
+                tag = "SPEECH" if speech else "SILENCE"
+                print(f"  [{t:7.2f}s] {tag} (hop {frame_number})", flush=True)
+            prev_speech = speech
+
+            # Scroll left and append new frame
+            ceps_buf[:, :-1] = ceps_buf[:, 1:]
+            ceps_buf[:, -1] = ceps
+            vad_buf[:-1] = vad_buf[1:]
+            vad_buf[-1] = 1.0 if speech else 0.0
+
+            # Throttle redraws: every 3rd frame during speech, every frame otherwise
+            if recv_frames % 3 == 0 or not speech:
+                im_ceps.set_data(ceps_buf)
+                line_vad.set_ydata(vad_buf)
+                fig.canvas.draw_idle()
+                fig.canvas.flush_events()
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        print(f"\nCapture stopped. Received {recv_frames} frames.")
+        try:
+            plt.close(fig)
+        except Exception:
+            pass
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live scrolling MFCC cepstral coefficient viewer "
+                    "from SOF compress PCM capture")
+    parser.add_argument('--card', '-c', type=int, default=0,
+                        help='ALSA card number (default: 0)')
+    parser.add_argument('--device', '-d', type=int, default=54,
+                        help='ALSA compress device number (default: 54)')
+    parser.add_argument('--width', '-w', type=int, default=SPECTROGRAM_WIDTH,
+                        help=f'Spectrogram width in frames (default: {SPECTROGRAM_WIDTH})')
+    parser.add_argument('--num-ceps', '-n', type=int, default=DEFAULT_NUM_CEPS,
+                        help=f'Number of cepstral coefficients (default: {DEFAULT_NUM_CEPS})')
+    args = parser.parse_args()
+
+    print(f"=== SOF MFCC Cepstral Coefficient Viewer (Compress PCM) ===\n")
+    run_spectrogram(args.card, args.device, args.width, args.num_ceps)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/audio/mfcc/tune/sof_mel_spectrogram_compress.py b/src/audio/mfcc/tune/sof_mel_spectrogram_compress.py
new file mode 100644
index 000000000000..559d28fc895b
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_spectrogram_compress.py
@@ -0,0 +1,227 @@
+"""Live scrolling mel spectrogram viewer for SOF compress PCM capture.
+
+Displays a real-time scrolling mel spectrogram and VAD strip from ALSA
+compress PCM capture (crecord) with embedded DSP VAD flag. No Whisper
+inference — this is a lightweight visualization tool.
+
+Frame format: [magic(int32), frame_number(uint32), reserved(int32),
+               energy(int32), noise_energy(int32), vad_flag(int32),
+               mel[0..79](int32)]
+
+Usage:
+    python sof_mel_spectrogram_compress.py [--card 0] [--device 48]
+    python sof_mel_spectrogram_compress.py --width 200  # wider spectrogram
+"""
+
+import argparse
+import os
+import queue
+import struct
+import subprocess
+import threading
+import numpy as np
+import matplotlib
+matplotlib.use('TkAgg')
+import matplotlib.pyplot as plt
+
+# SOF compress mel frame format constants (with DSP data header).
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)  # 'mfcc' as little-endian int32, i.e. b'ccfm' in the byte stream
+SOF_NUM_HEADER = 6            # header words: magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point: 23 fractional bits
+SOF_NUM_MEL = 80  # mel values per frame, following the header words
+SOF_FRAME_INTS = SOF_NUM_HEADER + SOF_NUM_MEL  # 86 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 344 bytes per frame (4 bytes per int32)
+SOF_HOP_SEC = 0.01           # 10 ms per STFT hop; converts frame numbers to seconds
+
+SPECTROGRAM_WIDTH = 300       # default number of frames visible (300 hops = 3 s)
+
+
+def decode_mel_frame(raw_ints):
+    """Convert 80 int32 Q9.23 values to float32 mel coefficients."""
+    return raw_ints.astype(np.float32) / (2 ** SOF_Q_FORMAT)
+
+
+def parse_frame(buf):
+    """Parse one complete mel frame from a bytearray buffer.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), mel[0..79](320B)] = 344 bytes
+
+    Mutates buf in-place (deletes consumed bytes).
+    Returns: (frame_number, vad_flag, mel_ints) or (None, None, None)
+    """
+    idx = buf.find(SOF_MAGIC_BYTES)
+    if idx < 0:
+        if len(buf) > 3:
+            del buf[:-3]
+        return None, None, None
+    end = idx + SOF_FRAME_BYTES
+    if end > len(buf):
+        del buf[:idx]
+        return None, None, None
+
+    frame_number = struct.unpack_from('<I', buf, idx + 4)[0]
+    vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+
+    mel_bytes = bytes(buf[idx + SOF_NUM_HEADER * 4:end])
+    mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+    del buf[:end]
+    return frame_number, vad_flag, mel_ints
+
+
+def run_spectrogram(card, device, width):
+    """Run crecord on (card, device) and plot its mel frames in a window ``width`` frames wide."""
+    # Rolling display buffers; the newest frame is always written to the last column.
+    mel_buf = np.zeros((SOF_NUM_MEL, width), dtype=np.float32)
+    vad_buf = np.zeros(width, dtype=np.float32)
+    x = np.arange(width)
+
+    fig, (ax_mel, ax_vad) = plt.subplots(
+        2, 1, figsize=(12, 5),
+        gridspec_kw={'height_ratios': [5, 1]},  # mel image 5x taller than the VAD strip
+        sharex=True
+    )
+    fig.tight_layout(pad=2.0)
+
+    im_mel = ax_mel.imshow(
+        mel_buf, aspect='auto', origin='lower',
+        interpolation='nearest', cmap='turbo',
+        vmin=-2.0, vmax=2.0  # fixed color range, not rescaled per frame
+    )
+    ax_mel.set_ylabel('Mel bin')
+    ax_mel.set_title('SOF Mel Spectrogram (compress PCM) — DSP VAD')
+    fig.colorbar(im_mel, ax=ax_mel, fraction=0.02, pad=0.02)
+
+    line_vad, = ax_vad.plot(
+        x, vad_buf, color='green', linewidth=1.5,
+        drawstyle='steps-post')
+    ax_vad.set_ylabel('VAD')
+    ax_vad.set_xlabel('Frame')
+    ax_vad.set_ylim(-0.1, 1.1)
+    ax_vad.set_yticks([0, 1])
+    ax_vad.set_yticklabels(['Silent', 'Speech'])
+
+    plt.ion()
+    plt.show(block=False)  # non-blocking so the capture loop below can run
+    fig.canvas.draw()
+    fig.canvas.flush_events()
+
+    # crecord command line; it is launched through `stdbuf -o0` (see cmd below)
+    crecord_cmd = [
+        'crecord', '-v',
+        '-c', str(card),
+        '-d', str(device),
+        '-I', 'BESPOKE',
+        '-R', '16000',
+        '-C', '2',
+        '-F', 'S32_LE',
+    ]
+    cmd = ['stdbuf', '-o0'] + crecord_cmd
+
+    print(f"Starting compress capture: {' '.join(crecord_cmd)}")
+    print(f"Spectrogram width: {width} frames ({width * SOF_HOP_SEC:.1f}s)")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                            bufsize=0)
+
+    frame_q = queue.Queue()  # parsed frames: reader thread -> plotting loop
+
+    def reader_thread():
+        buf = bytearray()
+        raw_fd = proc.stdout.fileno()
+        try:
+            while True:
+                data = os.read(raw_fd, SOF_FRAME_BYTES * 4)  # up to 4 frames' worth of bytes
+                if not data:
+                    break
+                buf.extend(data)
+                while True:
+                    frame_number, vad_flag, frame_ints = parse_frame(buf)
+                    if frame_ints is None:
+                        break
+                    frame_q.put((frame_number, vad_flag, frame_ints))
+        except (OSError, ValueError):  # a read/parse error ends the reader quietly
+            pass
+        frame_q.put(None)  # end-of-stream sentinel for the main loop
+
+    reader = threading.Thread(target=reader_thread, daemon=True)
+    reader.start()
+
+    recv_frames = 0
+    prev_speech = None
+
+    try:
+        while True:
+            try:
+                item = frame_q.get(timeout=0.05)
+            except queue.Empty:
+                fig.canvas.flush_events()  # no frame within 50 ms: just process GUI events
+                continue
+
+            if item is None:  # sentinel: reader finished, report crecord's exit status
+                stderr_out = proc.stderr.read().decode(errors='replace')
+                rc = proc.wait()
+                print(f"\ncrecord exited with code {rc}")
+                if stderr_out:
+                    print(f"stderr: {stderr_out}")
+                break
+
+            frame_number, vad_flag, frame_ints = item
+            recv_frames += 1
+            mel = decode_mel_frame(frame_ints)
+            speech = vad_flag != 0
+
+            if speech != prev_speech:  # log VAD transitions only (first frame always logs)
+                t = frame_number * SOF_HOP_SEC
+                tag = "SPEECH" if speech else "SILENCE"
+                print(f"  [{t:7.2f}s] {tag} (hop {frame_number})", flush=True)
+            prev_speech = speech
+
+            # Scroll left and append new frame
+            mel_buf[:, :-1] = mel_buf[:, 1:]
+            mel_buf[:, -1] = mel
+            vad_buf[:-1] = vad_buf[1:]
+            vad_buf[-1] = 1.0 if speech else 0.0
+
+            # Redraw on every 3rd received frame, and on every non-speech frame
+            if recv_frames % 3 == 0 or not speech:
+                im_mel.set_data(mel_buf)
+                line_vad.set_ydata(vad_buf)
+                fig.canvas.draw_idle()
+                fig.canvas.flush_events()
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:  # always stop crecord and close the figure
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        print(f"\nCapture stopped. Received {recv_frames} frames.")
+        try:
+            plt.close(fig)
+        except Exception:  # best effort: ignore errors while closing the figure
+            pass
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live scrolling mel spectrogram from SOF compress PCM capture")
+    parser.add_argument('--card', '-c', type=int, default=0,
+                        help='ALSA card number (default: 0)')
+    parser.add_argument('--device', '-d', type=int, default=54,
+                        help='ALSA compress device number (default: 54)')
+    parser.add_argument('--width', '-w', type=int, default=SPECTROGRAM_WIDTH,
+                        help=f'Spectrogram width in frames (default: {SPECTROGRAM_WIDTH})')
+    args = parser.parse_args()
+
+    print("=== SOF Mel Spectrogram Viewer (Compress PCM) ===\n")
+    run_spectrogram(args.card, args.device, args.width)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_compress.py b/src/audio/mfcc/tune/sof_mel_to_text_live_compress.py
new file mode 100644
index 000000000000..66e018a29803
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_compress.py
@@ -0,0 +1,482 @@
+"""Live SOF mel capture from compress PCM with DTX-aware Whisper transcription.
+
+Captures mel frames from ALSA compress device (crecord) with embedded VAD flag.
+Frame format: [magic(int32), frame_number(uint32), reserved(int32),
+               energy(int32), noise_energy(int32), vad_flag(int32),
+               mel[0..79](int32)]
+
+With DTX enabled, the DSP sends a configurable number of trailing silence
+frames (e.g. 20 = 200ms) after each speech-to-silence transition, then
+suppresses further silence. This gives the host enough silence to detect
+end-of-speech via a wall-clock patience timer.
+
+Usage:
+    python sof_mel_to_text_live_compress.py [--card 0] [--device 48] [--model whisper-medium-int4-ov]
+"""
+
+import argparse
+import os
+import queue
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# SOF compress mel frame format constants (with DSP data header).
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)  # 'mfcc' as little-endian int32, i.e. b'ccfm' in the byte stream
+SOF_NUM_HEADER = 6            # header words: magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point: 23 fractional bits
+SOF_NUM_MEL = 80  # mel values per frame, following the header words
+SOF_FRAME_INTS = SOF_NUM_HEADER + SOF_NUM_MEL  # 86 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 344 bytes per frame (4 bytes per int32)
+SOF_HOP_SEC = 0.01           # 10 ms per STFT hop; converts frame numbers to seconds
+
+# Speech buffering (frame counts assume one frame per 10 ms hop)
+SILENCE_PATIENCE_S = 1.0     # seconds of silence patience before triggering
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+MAX_SPEECH_MS = 60000        # max speech buffer before forced transcription
+MAX_SPEECH_FRAMES = MAX_SPEECH_MS // 10  # 6000 frames; NOTE(review): twice WHISPER_NB_MAX_FRAMES below
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80  # encoder input is reshaped to [1, 80, 3000]
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert 80 int32 Q9.23 values to float32 mel coefficients."""
+    return raw_ints.astype(np.float32) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder on OpenVINO; runs one transcription at a time in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000] (done for every encoder_device)
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False  # True while a transcription thread is running
+        self._lock = threading.Lock()  # guards _busy
+
+    def _load_tokenizer(self):
+        """Load the tokenizer from model_path: transformers if importable, else openvino_genai."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):  # True while a background transcription is in flight
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Start a background transcription; returns False (nothing started) if one is already running.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)  # invoked on the worker thread
+        except Exception as e:  # errors are printed here, not raised to the caller
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode at most WHISPER_NB_MAX_FRAMES mel frames (later frames are dropped) and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames with the smallest value in the clip; frames past 3000 are cut off
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text, dropping ids >= 50257 (the eos_token value used by _greedy_decode, and above)
+        text_tokens = [t for t in token_ids if t < 50257]
+        text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy (argmax) decode of up to max_tokens ids; the list includes the final EOS if one is produced."""
+        sot_tokens = [50258, 50259, 50359, 50363]  # presumably SOT, <|en|>, transcribe, notimestamps — confirm for this model
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))  # most likely id at the last position
+
+        generated = [next_token]
+        position = len(sot_tokens)  # cache_position value for the next single-token step
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def parse_frame(buf):
+    """Parse one complete mel frame from a bytearray buffer.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), mel[0..79](320B)] = 344 bytes
+
+    In compress PCM mode, each read delivers complete frames with no zero
+    padding, so we search for magic and extract.
+
+    Mutates buf in-place (deletes consumed bytes).
+    Returns: (frame_number, vad_flag, mel_ints) or (None, None, None)
+    """
+    idx = buf.find(SOF_MAGIC_BYTES)
+    if idx < 0:
+        if len(buf) > 3:
+            del buf[:-3]
+        return None, None, None
+    end = idx + SOF_FRAME_BYTES
+    if end > len(buf):
+        del buf[:idx]
+        return None, None, None
+
+    # Parse header fields
+    frame_number = struct.unpack_from('<I', buf, idx + 4)[0]
+    vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+
+    # Parse 80 mel coefficients (after 24-byte header)
+    mel_bytes = bytes(buf[idx + SOF_NUM_HEADER * 4:end])
+    mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+    del buf[:end]
+    return frame_number, vad_flag, mel_ints
+
+
+# ---------- Main capture + transcription loop ----------
+
+def try_transcribe(transcriber, speech_buffer, t, on_transcription):
+    """Attempt to send speech buffer to Whisper. Returns True if sent or discarded."""
+    n = len(speech_buffer)
+    duration = n * SOF_HOP_SEC
+
+    if n < MIN_SPEECH_FRAMES:
+        print(f"  [{t:7.2f}s] Too short ({duration:.1f}s), "
+              f"discarding {n} frames", flush=True)
+        return True
+
+    if not transcriber.is_busy():
+        print(f"  [{t:7.2f}s] Transcribing {n} frames "
+              f"({duration:.1f}s)...", flush=True)
+        frames_copy = list(speech_buffer)
+        transcriber.transcribe_async(frames_copy, on_transcription)
+        return True
+
+    print(f"  [{t:7.2f}s] (Whisper busy, queuing {n} frames)", flush=True)
+    return False
+
+
+def run_capture(card, device, model_path, encoder_device, decoder_device,
+                patience=SILENCE_PATIENCE_S):
+    """Main capture loop: crecord compress PCM → DSP VAD → buffer speech → Whisper.
+
+    With DTX, the FW sends:
+    - All VAD=1 (speech) frames
+    - Trailing VAD=0 silence frames (e.g. 20 = 200ms) after speech ends
+
+    A wall-clock patience timer triggers transcription after silence.
+    If speech resumes within the patience window, buffering continues.
+    """
+
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    crecord_cmd = [
+        'crecord', '-v',
+        '-c', str(card),
+        '-d', str(device),
+        '-I', 'BESPOKE',
+        '-R', '16000',
+        '-C', '2',
+        '-F', 'S32_LE',
+    ]
+
+    # Wrap with stdbuf to disable crecord's stdio buffering. When stdout
+    # is a pipe, C stdio uses full buffering (~4-8KB). A single DTX
+    # silence frame (344 bytes) would sit in crecord's buffer until enough
+    # data accumulates, delaying the patience timer by many seconds.
+    cmd = ['stdbuf', '-o0'] + crecord_cmd
+
+    print(f"Starting compress capture: {' '.join(crecord_cmd)}")
+    print(f"VAD source: DSP (embedded in stream, DTX mode)")
+    print(f"Silence patience: {patience}s")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                            bufsize=0)
+
+    # Reader thread feeds parsed frames into a queue to decouple pipe I/O
+    # from the patience timer in the main thread.
+    frame_q = queue.Queue()
+
+    def reader_thread():
+        buf = bytearray()
+        raw_fd = proc.stdout.fileno()
+        try:
+            while True:
+                # Use os.read() for unbuffered reads — returns immediately
+                # when any data is available. Python's read(n) waits for
+                # exactly n bytes, which delays SILENCE frames until the
+                # next speech burst fills the buffer.
+                data = os.read(raw_fd, SOF_FRAME_BYTES * 4)
+                if not data:
+                    break
+                buf.extend(data)
+                while True:
+                    frame_number, vad_flag, frame_ints = parse_frame(buf)
+                    if frame_ints is None:
+                        break
+                    frame_q.put((frame_number, vad_flag, frame_ints))
+        except (OSError, ValueError):
+            pass
+        frame_q.put(None)  # sentinel
+
+    reader = threading.Thread(target=reader_thread, daemon=True)
+    reader.start()
+
+    recv_frames = 0
+    prev_speech = None
+    last_hop = 0
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_time = None        # wall-clock time when first VAD=0 arrived
+    pending_queue = None       # queued frames waiting for Whisper to become free
+    pending_t = 0.0            # timestamp for queued frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    def flush_speech(t_now):
+        """Flush speech buffer to Whisper."""
+        nonlocal speech_buffer, silence_time, pending_queue, pending_t
+        if not speech_buffer:
+            silence_time = None
+            return
+        if not try_transcribe(transcriber, speech_buffer, t_now,
+                              on_transcription):
+            pending_queue = list(speech_buffer)
+            pending_t = t_now
+        speech_buffer.clear()
+        silence_time = None
+
+    try:
+        while True:
+            # Calculate queue timeout based on patience timer
+            get_timeout = 0.1  # default polling interval
+            if silence_time is not None:
+                remaining = patience - (time.monotonic() - silence_time)
+                get_timeout = max(remaining, 0.01)
+
+            try:
+                item = frame_q.get(timeout=get_timeout)
+            except queue.Empty:
+                # Patience expired — flush speech to Whisper
+                if silence_time is not None:
+                    elapsed = time.monotonic() - silence_time
+                    if elapsed >= patience:
+                        t = last_hop * SOF_HOP_SEC
+                        flush_speech(t)
+
+                # Drain pending queue when Whisper becomes free
+                if pending_queue is not None and not transcriber.is_busy():
+                    print(f"  [{pending_t:7.2f}s] Whisper free, sending "
+                          f"{len(pending_queue)} queued frames", flush=True)
+                    transcriber.transcribe_async(pending_queue, on_transcription)
+                    pending_queue = None
+                continue
+
+            if item is None:
+                # Reader thread ended (crecord exited)
+                stderr_out = proc.stderr.read().decode(errors='replace')
+                rc = proc.wait()
+                print(f"\ncrecord exited with code {rc}")
+                if stderr_out:
+                    print(f"stderr: {stderr_out}")
+                break
+
+            frame_number, vad_flag, frame_ints = item
+            recv_frames += 1
+            last_hop = frame_number
+            mel = decode_mel_frame(frame_ints)
+            speech = vad_flag != 0
+            t = frame_number * SOF_HOP_SEC
+
+            # Print VAD transitions
+            if speech != prev_speech:
+                tag = "SPEECH" if speech else "SILENCE"
+                print(f"  [{t:7.2f}s] {tag} (hop {frame_number}, "
+                      f"received {recv_frames})", flush=True)
+            prev_speech = speech
+
+            # Drain pending queue when Whisper becomes free
+            if pending_queue is not None and not transcriber.is_busy():
+                print(f"  [{pending_t:7.2f}s] Whisper free, sending "
+                      f"{len(pending_queue)} queued frames", flush=True)
+                transcriber.transcribe_async(pending_queue, on_transcription)
+                pending_queue = None
+
+            # --- Speech buffering logic ---
+            if speech:
+                if len(speech_buffer) >= MAX_SPEECH_FRAMES:
+                    n = len(speech_buffer)
+                    duration = n * SOF_HOP_SEC
+                    print(f"  [{t:7.2f}s] Buffer full ({duration:.1f}s), "
+                          f"forcing transcription", flush=True)
+                    flush_speech(t)
+
+                speech_buffer.append(mel.copy())
+                silence_time = None  # speech resumed, cancel patience timer
+
+            else:
+                # VAD=0: start patience timer if we have buffered speech.
+                # Don't refresh if already running so trailing silence
+                # frames don't extend the wait.
+                if speech_buffer and silence_time is None:
+                    silence_time = time.monotonic()
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        # Flush remaining speech
+        if speech_buffer:
+            t = last_hop * SOF_HOP_SEC
+            flush_speech(t)
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        print(f"\n\nCapture stopped. Received {recv_frames} frames.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture from compress PCM with DTX-aware "
+                    "Whisper transcription")
+    parser.add_argument('--card', '-c', type=int, default=0,
+                        help='ALSA card number (default: 0)')
+    parser.add_argument('--device', '-d', type=int, default=54,
+                        help='ALSA compress device number (default: 54)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    parser.add_argument('--patience', type=float, default=SILENCE_PATIENCE_S,
+                        help=f'Seconds of silence patience before triggering '
+                             f'transcription (default: {SILENCE_PATIENCE_S})')
+    args = parser.parse_args()
+
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (Compress PCM, DTX) ===\n")
+    run_capture(args.card, args.device, args.model, args.encoder_device,
+                args.decoder_device, patience=args.patience)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
new file mode 100644
index 000000000000..9171df2e3cec
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
@@ -0,0 +1,384 @@
+"""Live SOF mel capture with DSP VAD-triggered Whisper transcription.
+
+Captures mel frames from ALSA with embedded VAD flag from the DSP.
+Frame format: [magic(int32), frame_number(uint32), reserved(int32), energy(int32), noise_energy(int32), vad_flag(int32), mel[0..79](int32)]
+When silence of 100ms is detected after speech, sends the buffered mel
+features to Whisper (OpenVINO encoder+decoder) for transcription.
+Capture continues running during Whisper inference.
+
+Usage:
+    python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov]
+"""
+
+import argparse
+import os
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# SOF mel_s32.raw format constants (with DSP data header).
+# One frame is SOF_NUM_HEADER header words followed by SOF_NUM_MEL mel words.
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)  # ASCII 'mfcc' as little-endian int32, i.e. b'ccfm' in the byte stream
+SOF_NUM_HEADER = 6            # header words: magic, frame_number, reserved, energy, noise_energy, vad_flag
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point, 23 fractional bits
+SOF_NUM_MEL = 80
+SOF_FRAME_INTS = SOF_NUM_HEADER + SOF_NUM_MEL  # 86 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 344 bytes per frame
+
+# Speech buffering (frame counts below assume one frame per 10 ms)
+SILENCE_TRIGGER_MS = 100     # ms of silence after speech to trigger transcription
+SILENCE_TRIGGER_FRAMES = SILENCE_TRIGGER_MS // 10  # 10 frames at 10ms/frame
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+MAX_SPEECH_MS = 60000        # max speech buffer before forced transcription
+MAX_SPEECH_FRAMES = MAX_SPEECH_MS // 10  # 6000 frames at 10ms/frame, twice WHISPER_NB_MAX_FRAMES
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert 80 int32 Q9.23 values to float32 mel coefficients."""
+    return raw_ints.astype(np.float32) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder using OpenVINO, runs in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000]
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False
+        self._lock = threading.Lock()
+
+    def _load_tokenizer(self):
+        """Load Whisper tokenizer."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Run transcription in a background thread.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)
+        except Exception as e:
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode mel frames and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text
+        text_tokens = [t for t in token_ids if t < 50257]
+        text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy decoding loop."""
+        sot_tokens = [50258, 50259, 50359, 50363]
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))
+
+        generated = [next_token]
+        position = len(sot_tokens)
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def find_frame_in_buffer(buf):
+    """Find the first complete mel frame with data header in a bytearray.
+
+    Frame layout: [magic(4B), frame_number(4B), reserved(4B), energy(4B),
+                   noise_energy(4B), vad_flag(4B), mel[0..79](320B)] = 344 bytes
+    Mutates buf in-place (deletes consumed bytes).
+    Returns: (vad_flag, mel_ints) or (None, None)
+    """
+    idx = buf.find(SOF_MAGIC_BYTES)
+    if idx < 0:
+        if len(buf) > 3:
+            del buf[:-3]
+        return None, None
+    end = idx + SOF_FRAME_BYTES
+    if end > len(buf):
+        del buf[:idx]
+        return None, None
+    # Parse vad_flag at offset 20 (after magic + frame_number + reserved + energy + noise_energy)
+    vad_flag = struct.unpack_from('<i', buf, idx + 20)[0]
+    # Parse 80 mel coefficients (after 24-byte header)
+    mel_bytes = bytes(buf[idx + SOF_NUM_HEADER * 4 : end])
+    mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+    del buf[:end]
+    return vad_flag, mel_ints
+
+
+# ---------- Main capture + transcription loop ----------
+
+def run_capture(device, rate, model_path, encoder_device, decoder_device):
+    """Main capture loop: ALSA → DSP VAD → buffer speech → Whisper."""
+
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    cmd = [
+        'arecord', '-D', device, '-f', 'S32_LE', '-c', '2',
+        '-r', str(rate), '-t', 'raw', '--buffer-size', '8192',
+    ]
+
+    print(f"Starting capture: {' '.join(cmd)}")
+    print(f"VAD source: DSP (embedded in stream)")
+    print(f"Silence trigger: {SILENCE_TRIGGER_MS}ms ({SILENCE_TRIGGER_FRAMES} frames)")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    buf = bytearray()
+    read_chunk = SOF_FRAME_BYTES * 4
+    frame_num = 0
+    prev_speech = None
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_counter = 0        # consecutive silence frames after speech
+    was_speaking = False       # True if we have buffered speech frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    try:
+        while True:
+            data = proc.stdout.read(read_chunk)
+            if not data:
+                rc = proc.poll()
+                if rc is not None:
+                    stderr_out = proc.stderr.read().decode(errors='replace')
+                    print(f"\narecord exited with code {rc}")
+                    if stderr_out:
+                        print(f"stderr: {stderr_out}")
+                    break
+                continue
+
+            buf.extend(data)
+
+            while True:
+                vad_flag, frame_ints = find_frame_in_buffer(buf)
+                if frame_ints is None:
+                    break
+
+                frame_num += 1
+                mel = decode_mel_frame(frame_ints)
+                speech = vad_flag != 0
+
+                # Print VAD transitions
+                if speech != prev_speech:
+                    t = frame_num * 0.01
+                    tag = "SPEECH" if speech else "SILENCE"
+                    print(f"  [{t:7.2f}s] {tag}", flush=True)
+                prev_speech = speech
+
+                # --- Speech buffering logic ---
+                if speech:
+                    if len(speech_buffer) >= MAX_SPEECH_FRAMES:
+                        n = len(speech_buffer)
+                        duration = n * 0.01
+                        t = frame_num * 0.01
+                        print(f"  [{t:7.2f}s] Buffer full ({duration:.1f}s), "
+                              f"forcing transcription of {n} frames",
+                              flush=True)
+                        if not transcriber.is_busy():
+                            frames_copy = list(speech_buffer)
+                            transcriber.transcribe_async(
+                                frames_copy, on_transcription)
+                        else:
+                            print(f"  [{t:7.2f}s] (Whisper busy, "
+                                  f"dropping {n} frames)", flush=True)
+                        speech_buffer.clear()
+                    speech_buffer.append(mel.copy())
+                    silence_counter = 0
+                    was_speaking = True
+                else:
+                    if was_speaking:
+                        silence_counter += 1
+                        if silence_counter >= SILENCE_TRIGGER_FRAMES:
+                            n = len(speech_buffer)
+                            duration = n * 0.01
+                            t = frame_num * 0.01
+
+                            if n < MIN_SPEECH_FRAMES:
+                                # Too short — discard
+                                speech_buffer.clear()
+                                silence_counter = 0
+                                was_speaking = False
+                                continue
+
+                            # Silence threshold reached — send to Whisper
+                            print(f"  [{t:7.2f}s] Transcribing {n} frames "
+                                  f"({duration:.1f}s)...", flush=True)
+
+                            if not transcriber.is_busy():
+                                frames_copy = list(speech_buffer)
+                                transcriber.transcribe_async(
+                                    frames_copy, on_transcription)
+                            else:
+                                print(f"  [{t:7.2f}s] (Whisper busy, "
+                                      f"dropping {n} frames)", flush=True)
+
+                            speech_buffer.clear()
+                            silence_counter = 0
+                            was_speaking = False
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        print("\n\nCapture stopped.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture with DSP VAD-triggered Whisper transcription")
+    parser.add_argument('--device', '-D', default='hw:0,47',
+                        help='ALSA capture device (default: hw:0,47)')
+    parser.add_argument('--rate', '-r', type=int, default=16000,
+                        help='Sample rate for arecord (default: 16000)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    args = parser.parse_args()
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n")
+    run_capture(args.device, args.rate, args.model, args.encoder_device,
+                args.decoder_device)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..b380cd84fdf0 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -9,9 +9,12 @@
 #define __SOF_AUDIO_MFCC_MFCC_COMP_H__
 
 #include <sof/audio/module_adapter/module/generic.h>
+#include <sof/audio/data_blob.h>
 #include <sof/math/auditory.h>
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
+#include <sof/audio/mfcc/mfcc_vad.h>
+#include <sof/ipc/msg.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -31,17 +34,24 @@
 
 #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */
 #define MFCC_FFT_BITS	32
+#define MFCC_MAX_SAMPLE_RATE 64000 /* Max sample rate in Hz, limited by int16_t Mel scale */
 
-/** \brief Type definition for processing function select return value. */
-typedef void (*mfcc_func)(struct processing_module *mod,
-			  struct input_stream_buffer *bsource,
-			  struct output_stream_buffer *bsink,
-			  int frames);
+/** \brief Switch control index for VAD notification to user space */
+#define MFCC_CTRL_INDEX_VAD	0
 
-/** \brief MFCC processing functions map item. */
-struct mfcc_func_map {
-	uint8_t source;		/**< source frame format */
-	mfcc_func func;		/**< processing function */
+/**
+ * \brief Data header prepended to every MFCC output frame.
+ *
+ * Written before the Mel spectrum or cepstral coefficient data in each
+ * output frame. The header consists of six 32-bit fields; the first one
+ * is the magic word that marks the start of a frame.
+ */
+struct mfcc_data_header {
+	uint32_t magic;		/**< Magic word MFCC_MAGIC (0x6d666363) */
+	uint32_t frame_number;	/**< Frame number, counting calculated frames starting from 0 */
+	int32_t reserved;	/**< Reserved for future use, set to 0 */
+	int32_t energy;		/**< Weighted signal energy in Q9.23 */
+	int32_t noise_energy;	/**< Weighted noise floor energy in Q9.23 */
+	int32_t vad_flag;	/**< VAD decision: 1 = speech, 0 = silence */
 };
 
 struct mfcc_buffer {
@@ -60,6 +70,10 @@ struct mfcc_pre_emph {
 	int enable;
 };
 
+/** \brief Type definition for source/sink based input copy function. */
+typedef void (*mfcc_source_func)(struct sof_source *source, struct mfcc_buffer *buf,
+				 struct mfcc_pre_emph *emph, int frames, int source_channel);
+
 struct mfcc_fft {
 	struct icomplex32 *fft_buf; /**< fft_padded_size */
 	struct icomplex32 *fft_out; /**< fft_padded_size */
@@ -105,20 +119,29 @@ struct mfcc_state {
 	bool mel_only; /**< When true, output Mel spectra instead of cepstral coefficients */
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
-	bool magic_pending; /**< True when magic word not yet written for current output */
+	bool header_pending; /**< True when data header not yet written for current output */
+	struct mfcc_data_header header; /**< Data header for current output frame */
 	size_t sample_buffers_size; /**< bytes */
-	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
-	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
-	int out_remain; /**< Remaining int16_t samples to write to sink from scratch */
+	int32_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
+	int out_remain; /**< Remaining int32_t samples to write to sink from scratch */
+	uint32_t hop_count; /**< FFT hop counter, increments every processed hop */
+	int vad_silence_count; /**< Consecutive VAD=0 hops since last speech */
+	int16_t dtx_trailing_silence; /**< Number of trailing silence hops to send, from config */
+	int16_t dtx_silence_interval; /**< Send silence frame every Nth hop, 0 = disable */
+	int dtx_silence_counter; /**< Counter for periodic silence frame send */
 };
 
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+	struct mfcc_vad_state vad;
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
+	struct ipc_msg *msg;		/**< IPC notification for VAD switch control */
 	int max_frames;
-	mfcc_func mfcc_func;		/**< processing function */
+	enum sof_ipc_frame source_format;	/**< Source audio format for output sizing */
+	bool vad_prev;			/**< Previous VAD state for edge detection */
+	mfcc_source_func source_func;	/**< source copy function */
 };
 
 static inline int mfcc_buffer_samples_without_wrap(struct mfcc_buffer *buffer, int16_t *ptr)
@@ -145,31 +168,83 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state);
 
 void mfcc_apply_window(struct mfcc_state *state, int input_shift);
 
-#if CONFIG_FORMAT_S16LE
+/**
+ * \brief Run STFT and Mel/DCT processing.
+ * \return Number of output coefficients produced, or 0 if not enough data.
+ */
+int mfcc_stft_process(struct processing_module *mod, struct mfcc_comp_data *cd);
 
-void mfcc_source_copy_s16(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
-			  struct mfcc_pre_emph *emph, int frames, int source_channel);
+/**
+ * \brief Prepare and commit MFCC output data after STFT processing.
+ *
+ * This handles the output data conversion and dispatches to either the
+ * compress-output or legacy PCM-output path.
+ *
+ * \return 0 on success or a negative error code.
+ */
+int mfcc_process_output(struct processing_module *mod, struct mfcc_comp_data *cd,
+			 struct sof_source **sources, struct sof_sink **sinks,
+			 int num_ceps, int frames);
 
-void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames);
+#if CONFIG_FORMAT_S16LE
+void mfcc_source_copy_s16(struct sof_source *source, struct mfcc_buffer *buf,
+			  struct mfcc_pre_emph *emph, int frames, int source_channel);
 #endif
 
 #if CONFIG_FORMAT_S24LE
-
-void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+void mfcc_source_copy_s24(struct sof_source *source, struct mfcc_buffer *buf,
 			  struct mfcc_pre_emph *emph, int frames, int source_channel);
-
-void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames);
 #endif
 
 #if CONFIG_FORMAT_S32LE
-
-void mfcc_source_copy_s32(struct input_stream_buffer *bsource, struct mfcc_buffer *buf,
+void mfcc_source_copy_s32(struct sof_source *source, struct mfcc_buffer *buf,
 			  struct mfcc_pre_emph *emph, int frames, int source_channel);
+#endif
+
+#if CONFIG_IPC_MAJOR_4
+int mfcc_ipc_notification_init(struct processing_module *mod);
+
+void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val);
 
-void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer *bsource,
-		      struct output_stream_buffer *bsink, int frames);
+int mfcc_get_config(struct processing_module *mod,
+		    uint32_t config_id, uint32_t *data_offset_size,
+		    uint8_t *fragment, size_t fragment_size);
+
+int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
+		    enum module_cfg_fragment_position pos, uint32_t data_offset_size,
+		    const uint8_t *fragment, size_t fragment_size, uint8_t *response,
+		    size_t response_size);
+
+#else
+/* Stand-in for builds without CONFIG_IPC_MAJOR_4: does nothing, returns 0. */
+static inline int mfcc_ipc_notification_init(struct processing_module *mod)
+{
+	return 0;
+}
+
+/* Stand-in for builds without CONFIG_IPC_MAJOR_4: intentionally empty. */
+static inline void mfcc_send_vad_notification(struct processing_module *mod, uint32_t val)
+{
+}
+
+/*
+ * Configuration read for builds without CONFIG_IPC_MAJOR_4: the fragment
+ * is treated as a struct sof_ipc_ctrl_data and passed to the data blob
+ * handler of the component. config_id and data_offset_size are not used.
+ */
+static inline int mfcc_get_config(struct processing_module *mod,
+				  uint32_t config_id, uint32_t *data_offset_size,
+				  uint8_t *fragment, size_t fragment_size)
+{
+	struct mfcc_comp_data *priv = module_get_private_data(mod);
+
+	return comp_data_blob_get_cmd(priv->model_handler,
+				      (struct sof_ipc_ctrl_data *)fragment,
+				      fragment_size);
+}
+
+/*
+ * Configuration write for builds without CONFIG_IPC_MAJOR_4: the fragment
+ * is passed to the data blob handler of the component. config_id, response
+ * and response_size are not used.
+ */
+static inline int mfcc_set_config(struct processing_module *mod, uint32_t config_id,
+				  enum module_cfg_fragment_position pos, uint32_t data_offset_size,
+				  const uint8_t *fragment, size_t fragment_size, uint8_t *response,
+				  size_t response_size)
+{
+	struct mfcc_comp_data *priv = module_get_private_data(mod);
+	struct comp_data_blob_handler *blob = priv->model_handler;
+
+	return comp_data_blob_set(blob, pos, data_offset_size, fragment, fragment_size);
+}
 #endif
 
 #ifdef UNIT_TEST
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..6873343d334e
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct processing_module;
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES	100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.003 * 2^15).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA	98
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.020 * 2^15).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST	655
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.30 * 2^23).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD	2516582
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES	20
+
+/**
+ * \brief VAD state structure.
+ *
+ * Holds the noise floor tracking data, the tuning parameters and the
+ * current decision. The same instance is passed to mfcc_vad_init() and
+ * to every mfcc_vad_update() call.
+ */
+struct mfcc_vad_state {
+	int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */
+	int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy; /**< Weighted signal energy in Q9.23 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int32_t noise_energy; /**< Weighted noise floor energy in Q9.23 */
+	int16_t frame_count; /**< Initial convergence frames processed */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t init_frames; /**< Number of initial frames for fast convergence */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	bool initialized; /**< True after first frame processed */
+	bool is_speech; /**< Current VAD decision */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \param[in] mod Processing module for memory allocation.
+ * \return 0 on success, negative error code on failure.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int32_t sample_rate,
+		  struct processing_module *mod);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected, 0 if silence.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */
diff --git a/src/include/user/mfcc.h b/src/include/user/mfcc.h
index 8a0defcd9883..286ee4f5e985 100644
--- a/src/include/user/mfcc.h
+++ b/src/include/user/mfcc.h
@@ -54,7 +54,9 @@ struct sof_mfcc_config {
 	int16_t mel_scale; /**< Q4.12 default 1.0, use 0.25 for Whisper */
 	int16_t mmax_init; /**< Q8.7 default 0, with dynamic_mmax false, can sim. Whisper mmax */
 	int16_t mmax_coef; /**< Q1.15 decay coefficient for dynamic mmax, a small value for slow */
-	uint32_t reserved[6];
+	uint16_t dtx_trailing_silence_hops; /**< DTX: number of silence hops to send after speech, 0 = send first only */
+	uint16_t dtx_silence_hops_interval; /**< DTX: send silence frame every Nth hop during VAD=0, 0 = disable */
+	uint32_t reserved[5];
 	int32_t sample_frequency; /**< Hz. e.g. 16000 */
 	int32_t pmin; /**< Q1.31 linear power, limit minimum Mel energy, e.g. 1e-9 */
 	enum sof_mfcc_mel_log_type mel_log; /**< Use MEL_LOG_IS_LOG, LOG10 or DB*/
@@ -77,6 +79,7 @@ struct sof_mfcc_config {
 	int16_t vtln_high; /**< Reserved, no support */
 	int16_t vtln_low; /**< Reserved, no support */
 	int16_t vtln_warp; /**< Reserved, no support */
+	int16_t reserved16[3]; /**< Reserved for future 16-bit fields, set to 0 */
 	bool htk_compat; /**< Must be false */
 	bool raw_energy; /**< Reserved, no support */
 	bool remove_dc_offset; /**< Reserved, no support */
@@ -85,8 +88,11 @@ struct sof_mfcc_config {
 	bool subtract_mean; /**< Must be false (0) */
 	bool use_energy; /**< Must be false (0) */
 	bool dynamic_mmax; /**< Track max Mel value for clamp with top_db value */
-	bool reserved_bool2;
-	bool reserved_bool3;
+	bool enable_vad; /**< Run VAD algorithm */
+	bool enable_dtx; /**< Discontinuous transmission: suppress silence after trailing frames */
+	bool update_controls; /**< Update controls with VAD decision */
+	bool compress_output; /**< Use compress PCM output: variable size, no zero padding */
+	bool reserved_bool[4]; /* Reserved for future boolean flags, set to false (0) */
 } __attribute__((packed));
 
 #endif /* __USER_MFCC_H__ */
diff --git a/tools/topology/topology2/cavs-sdw.conf b/tools/topology/topology2/cavs-sdw.conf
index 6932543c06e5..0f597ded3793 100644
--- a/tools/topology/topology2/cavs-sdw.conf
+++ b/tools/topology/topology2/cavs-sdw.conf
@@ -254,6 +254,14 @@ IncludeByKey.SDW_JACK_AUDIO_FEATURE_CAPTURE {
 	"true" "platform/intel/sdw-jack-audio-feature.conf"
 }
 
+IncludeByKey.SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE {
+	"true" "platform/intel/sdw-jack-audio-feature-compress.conf"
+}
+
 IncludeByKey.SDW_DMIC_AUDIO_FEATURE_CAPTURE {
 	"true" "platform/intel/sdw-dmic-audio-feature.conf"
 }
+
+IncludeByKey.SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE {
+	"true" "platform/intel/sdw-dmic-audio-feature-compress.conf"
+}
diff --git a/tools/topology/topology2/development/tplg-targets.cmake b/tools/topology/topology2/development/tplg-targets.cmake
index a906852d04f0..155176c16347 100644
--- a/tools/topology/topology2/development/tplg-targets.cmake
+++ b/tools/topology/topology2/development/tplg-targets.cmake
@@ -479,11 +479,33 @@ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-
 SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,COMPRESSED=true"
 
 # Soundwire topologies with MFCC audio features capture
-"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\
+"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\
 HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_AUDIO_FEATURE_CAPTURE=true"
 
-"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\
+"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-normal\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\
 SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\
 SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\
 SDW_JACK_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_AUDIO_FEATURE_CAPTURE=true"
+
+# Soundwire topologies with compress MFCC mel audio features capture
+"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\
+HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\
+MFCC_FRAME_BYTES=344,MFCC_BLOB=mel"
+
+# Soundwire topologies with compress MFCC cepstral audio features capture
+"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\
+HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,\
+MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps"
+
+"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-mel-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\
+SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\
+SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\
+SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\
+MFCC_FRAME_BYTES=344,MFCC_BLOB=mel"
+
+"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc-ceps-compr\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\
+SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\
+SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\
+SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE=true,\
+MFCC_FRAME_BYTES=76,MFCC_BLOB=ceps"
 )
diff --git a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
index d45baec1ee8f..8788387ec8c7 100644
--- a/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
+++ b/tools/topology/topology2/include/bench/mfcc_controls_capture.conf
@@ -9,9 +9,9 @@
 						"mel80" "include/components/mfcc/mel80.conf"
 					}
 				}
-				#mixer."1" {
-				#	name '$ANALOG_CAPTURE_PCM MFCC switch or volume'
-				#}
+				mixer."1" {
+					name '$ANALOG_CAPTURE_PCM MFCC switch'
+				}
 				#enum."1" {
 				#	name '$ANALOG_CAPTURE_PCM MFCC enum'
 				#}
diff --git a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
index cc2ada04b8d7..007dbb91cd4f 100644
--- a/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
+++ b/tools/topology/topology2/include/bench/mfcc_controls_playback.conf
@@ -9,9 +9,9 @@
 						"mel80" "include/components/mfcc/mel80.conf"
 					}
 				}
-				#mixer."1" {
-				#	name '$ANALOG_PLAYBACK_PCM MFCC switch or volume'
-				#}
+				mixer."1" {
+					name '$ANALOG_PLAYBACK_PCM MFCC switch'
+				}
 				#enum."1" {
 				#	name '$ANALOG_PLAYBACK_PCM MFCC enum'
 				#}
diff --git a/tools/topology/topology2/include/common/common_definitions.conf b/tools/topology/topology2/include/common/common_definitions.conf
index 87c69dd41e41..06f0f425c5e2 100644
--- a/tools/topology/topology2/include/common/common_definitions.conf
+++ b/tools/topology/topology2/include/common/common_definitions.conf
@@ -72,5 +72,7 @@ Define {
 	SDW_JACK_ECHO_REF			false # No echo reference for 3.5mm jack
 	SDW_SPK_ECHO_REF			false # No echo reference for speaker
 	SDW_JACK_AUDIO_FEATURE_CAPTURE		false # No audio features capture for jack
+	SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE	false # No compress audio features capture for jack
 	SDW_DMIC_AUDIO_FEATURE_CAPTURE		false # No audio features capture for microphone
+	SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE	false # No compress audio features capture for microphone
 }
diff --git a/tools/topology/topology2/include/components/mfcc.conf b/tools/topology/topology2/include/components/mfcc.conf
index 221df8f2d437..bf908e685048 100644
--- a/tools/topology/topology2/include/components/mfcc.conf
+++ b/tools/topology/topology2/include/components/mfcc.conf
@@ -13,6 +13,8 @@
 #
 # Where M is pipeline ID and N is a unique integer in the parent object.
 
+<include/controls/mixer.conf>
+
 Class.Widget."mfcc" {
 	#
 	# Pipeline ID
@@ -45,7 +47,6 @@ Class.Widget."mfcc" {
 
 		!immutable [
 			"uuid"
-			"type"
 		]
 		!deprecated [
 			"preload_count"
@@ -53,6 +54,26 @@ Class.Widget."mfcc" {
 		unique	"instance"
 	}
 
+	#
+	# MFCC Widget switch control to optionally notify VAD state changes
+	#
+	Object.Control {
+		mixer."1" {
+			Object.Base.channel.1 {
+				name	"fc"
+				shift	0
+			}
+			Object.Base.ops.1 {
+				name	"ctl"
+				info	"volsw"
+				# 259 binds the mixer control to the switch get/put handlers
+				get	259
+				put	259
+			}
+			max 1
+		}
+	}
+
 	#
 	# Default attributes for mfcc
 	#
diff --git a/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf
new file mode 100644
index 000000000000..7056b9e7cb4b
--- /dev/null
+++ b/tools/topology/topology2/include/components/mfcc/ceps13_compress_dtx.conf
@@ -0,0 +1,24 @@
+# Exported MFCC configuration 26-May-2026
+# cd src/audio/mfcc/tune; octave setup_mfcc.m
+Object.Base.data."mfcc_config" {
+	bytes "
+		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x02,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00,
+		0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00,
+		0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01,
+		0x01,0x00,0x00,0x00,0x01,0x01,0x01,0x01,
+		0x00,0x00,0x00,0x00"
+}
diff --git a/tools/topology/topology2/include/components/mfcc/default.conf b/tools/topology/topology2/include/components/mfcc/default.conf
index 42a6d6608b8b..0ac19fa71d04 100644
--- a/tools/topology/topology2/include/components/mfcc/default.conf
+++ b/tools/topology/topology2/include/components/mfcc/default.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 26-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -17,6 +17,8 @@ Object.Base.data."mfcc_config" {
 		0xc3,0x35,0x00,0x2c,0x00,0x00,0x00,0x00,
 		0x90,0x01,0xa0,0x00,0x00,0x00,0x14,0x00,
 		0x0d,0x00,0x17,0x00,0x00,0x00,0x00,0x64,
-		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
-		0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x01,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00"
 }
diff --git a/tools/topology/topology2/include/components/mfcc/mel80.conf b/tools/topology/topology2/include/components/mfcc/mel80.conf
index 04aa2a15c660..b18baadd459b 100644
--- a/tools/topology/topology2/include/components/mfcc/mel80.conf
+++ b/tools/topology/topology2/include/components/mfcc/mel80.conf
@@ -1,12 +1,12 @@
-# Exported MFCC configuration 05-May-2026
+# Exported MFCC configuration 26-May-2026
 # cd src/audio/mfcc/tune; octave setup_mfcc.m
 Object.Base.data."mfcc_config" {
 	bytes "
 		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x68,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
@@ -18,5 +18,7 @@ Object.Base.data."mfcc_config" {
 		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
 		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
 		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-		0x00,0x01,0x01,0x00,0x00,0x01,0x00,0x00"
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+		0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x00,
+		0x00,0x00,0x00,0x00"
 }
diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress.conf b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf
new file mode 100644
index 000000000000..f26f2af6980c
--- /dev/null
+++ b/tools/topology/topology2/include/components/mfcc/mel80_compress.conf
@@ -0,0 +1,24 @@
+# Exported MFCC configuration 26-May-2026
+# cd src/audio/mfcc/tune; octave setup_mfcc.m
+Object.Base.data."mfcc_config" {
+	bytes "
+		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
+		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+		0x01,0x00,0x00,0x01,0x01,0x00,0x01,0x01,
+		0x00,0x00,0x00,0x00"
+}
diff --git a/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf
new file mode 100644
index 000000000000..d225811ca4d1
--- /dev/null
+++ b/tools/topology/topology2/include/components/mfcc/mel80_compress_dtx.conf
@@ -0,0 +1,24 @@
+# Exported MFCC configuration 26-May-2026
+# cd src/audio/mfcc/tune; octave setup_mfcc.m
+Object.Base.data."mfcc_config" {
+	bytes "
+		0x53,0x4f,0x46,0x34,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x01,0xd0,0x01,0x03,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x74,0x00,0x00,0x00,0x00,0x02,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x14,0x00,0xf4,0x01,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x80,0x3e,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x03,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x90,0x01,0xa0,0x00,0x40,0x1f,0x00,0x00,
+		0x00,0x00,0x50,0x00,0x00,0x00,0x00,0x04,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
+		0x01,0x00,0x00,0x01,0x01,0x01,0x01,0x01,
+		0x00,0x00,0x00,0x00"
+}
diff --git a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf
index 793f71b883ab..fe6249018ef1 100644
--- a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf
+++ b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf
@@ -22,6 +22,12 @@
 <include/components/src.conf>
 <include/components/mfcc.conf>
 
+Define {
+	# Default MFCC output frame size (header + coefficients).
+	# Can be overridden by feature/platform includes or CMake variable overrides.
+	MFCC_FRAME_BYTES 344
+}
+
 Class.Pipeline."host-gateway-src-mfcc-capture" {
 
 	<include/pipelines/pipeline-common.conf>
@@ -85,6 +91,9 @@ Class.Pipeline."host-gateway-src-mfcc-capture" {
 					out_bit_depth		32
 					out_valid_bit_depth	32
 					out_rate		16000
+					# Compress output frame: header + coefficients.
+					# Size set by MFCC_FRAME_BYTES Define.
+					obs			$MFCC_FRAME_BYTES
 				}
 			]
 		}
@@ -101,6 +110,8 @@ Class.Pipeline."host-gateway-src-mfcc-capture" {
 					in_bit_depth		32
 					in_valid_bit_depth	32
 					in_rate			16000
+					# Match MFCC compress output frame size
+					ibs			$MFCC_FRAME_BYTES
 				}
 			]
 			Object.Base.output_audio_format [
@@ -108,6 +119,7 @@ Class.Pipeline."host-gateway-src-mfcc-capture" {
 					out_bit_depth		32
 					out_valid_bit_depth	32
 					out_rate		16000
+					obs			$MFCC_FRAME_BYTES
 				}
 			]
 		}
diff --git a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf
index f3926a283a8b..3aad756a85f5 100644
--- a/tools/topology/topology2/platform/intel/dmic1-mfcc.conf
+++ b/tools/topology/topology2/platform/intel/dmic1-mfcc.conf
@@ -454,11 +454,14 @@ Object.Widget.mfcc.1 {
 	index		$DMIC1_HOST_PIPELINE_ID
 	Object.Control {
 		bytes."1" {
-			name 'Analog Capture TDFB bytes'
+			name "Dmic1 Capture MFCC bytes"
 			IncludeByKey.DMIC1_MFCC_PARAMS {
 				"default" "include/components/mfcc/default.conf"
 			}
 		}
+		mixer."1" {
+			name "Dmic1 Capture MFCC VAD"
+		}
 	}
 	IncludeByKey.NUM_DMICS {
 		"1" {
diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf
new file mode 100644
index 000000000000..9e307043830b
--- /dev/null
+++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature-compress.conf
@@ -0,0 +1,71 @@
+Define {
+	SDW_DMIC_MODULE_COPIER_ID 41
+	SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Microphone Compress Audio Features"
+	SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 54
+	SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Compress Audio Features Stream"
+	SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 133
+	# MFCC compress output frame size in bytes (24-byte header + coefficients):
+	# Mel-only (80 bins): 24 + 80*4 = 344
+	# Cepstral (13 ceps): 24 + 13*4 = 76
+	MFCC_FRAME_BYTES 344
+	# MFCC config blob: mel or ceps
+	MFCC_BLOB mel
+}
+
+Object.Pipeline.host-gateway-src-mfcc-capture [
+	{
+		index $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID
+
+		Object.Widget.host-copier.1 {
+			stream_name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME"
+			pcm_id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID
+		}
+
+		Object.Widget.mfcc.1 {
+			type "encoder"
+			Object.Control {
+				bytes."1" {
+					name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
+					IncludeByKey.MFCC_BLOB {
+						"mel"	"include/components/mfcc/mel80_compress_dtx.conf"
+						"ceps"	"include/components/mfcc/ceps13_compress_dtx.conf"
+					}
+				}
+				mixer."1" {
+					name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+				}
+			}
+		}
+	}
+]
+Object.Base.route [
+	{
+		source "module-copier.$SDW_DMIC_MODULE_COPIER_ID.0"
+		sink "src.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1"
+	}
+	{
+		source "mfcc.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1"
+		sink "host-copier.$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture"
+	}
+]
+
+Object.PCM.pcm [
+	{
+		name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME"
+		id $SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID
+		direction "capture"
+		compress "true"
+
+		Object.Base.fe_dai.1 {
+			name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME"
+		}
+
+		Object.PCM.pcm_caps.1 {
+			name "$SDW_DMIC_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME"
+			formats 'S32_LE'
+			rates '16000'
+			channels_min 2
+			channels_max 2
+		}
+	}
+]
diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
index 87039b261597..7d39c11772c1 100644
--- a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf
@@ -4,6 +4,9 @@ Define {
 	SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID 48
 	SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Audio Features Stream"
 	SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 131
+	# MFCC output frame size in bytes (24-byte header + coefficients):
+	# Mel-only (80 bins): 24 + 80*4 = 344
+	MFCC_FRAME_BYTES 344
 }
 
 Object.Pipeline.host-gateway-src-mfcc-capture [
@@ -21,6 +24,9 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+				}
 			}
 		}
 	}
diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf
new file mode 100644
index 000000000000..286af8be0323
--- /dev/null
+++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature-compress.conf
@@ -0,0 +1,71 @@
+Define {
+	SDW_JACK_MODULE_COPIER_ID 11
+	SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME "Jack In Compress Audio Features"
+	SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID 53
+	SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Compress Audio Features Stream"
+	SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 132
+	# MFCC compress output frame size in bytes (24-byte header + coefficients):
+	# Mel-only (80 bins): 24 + 80*4 = 344
+	# Cepstral (13 ceps): 24 + 13*4 = 76
+	MFCC_FRAME_BYTES 344
+	# MFCC config blob: mel or ceps
+	MFCC_BLOB mel
+}
+
+Object.Pipeline.host-gateway-src-mfcc-capture [
+	{
+		index $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID
+
+		Object.Widget.host-copier.1 {
+			stream_name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME"
+			pcm_id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID
+		}
+
+		Object.Widget.mfcc.1 {
+			type "encoder"
+			Object.Control {
+				bytes."1" {
+					name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
+					IncludeByKey.MFCC_BLOB {
+						"mel"	"include/components/mfcc/mel80_compress_dtx.conf"
+						"ceps"	"include/components/mfcc/ceps13_compress_dtx.conf"
+					}
+				}
+				mixer."1" {
+					name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+				}
+			}
+		}
+	}
+]
+Object.Base.route [
+	{
+		source "module-copier.$SDW_JACK_MODULE_COPIER_ID.0"
+		sink "src.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1"
+	}
+	{
+		source "mfcc.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1"
+		sink "host-copier.$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID.capture"
+	}
+]
+
+Object.PCM.pcm [
+	{
+		name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME"
+		id $SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_ID
+		direction "capture"
+		compress "true"
+
+		Object.Base.fe_dai.1 {
+			name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_PCM_NAME"
+		}
+
+		Object.PCM.pcm_caps.1 {
+			name "$SDW_JACK_COMPR_AUDIO_FEATURE_CAPTURE_STREAM_NAME"
+			formats 'S32_LE'
+			rates '16000'
+			channels_min $SDW_JACK_CAPTURE_CH
+			channels_max $SDW_JACK_CAPTURE_CH
+		}
+	}
+]
diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
index 9645199d6907..a0a44eae4d87 100644
--- a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
+++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf
@@ -4,6 +4,9 @@ Define {
 	SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID 47
 	SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Audio Features Stream"
 	SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 130
+	# MFCC output frame size in bytes (24-byte header + coefficients):
+	# Mel-only (80 bins): 24 + 80*4 = 344
+	MFCC_FRAME_BYTES 344
 }
 
 Object.Pipeline.host-gateway-src-mfcc-capture [
@@ -21,6 +24,9 @@ Object.Pipeline.host-gateway-src-mfcc-capture [
 					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes"
 					<include/components/mfcc/mel80.conf>
 				}
+				mixer."1" {
+					name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC VAD"
+				}
 			}
 		}
 	}