Audio: Mixin_mixout: Optimize the mixin processing functions

Use audio_stream_samples_without_wrap_sx() to replace the complicated
per-sample boundary detection and split the mix functions into a
normal mode and a remap mode.

Normal mode (the default mode) saves at least 55.5% of the cycles in
the C version and at least 76.7% in the HiFi3 version.
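
The gain comes from hoisting the wrap checks out of the per-sample
loop: the number of samples that can be processed before either buffer
wraps is taken once per span from audio_stream_samples_without_wrap_sx(),
and only a tight inner loop runs per sample. A minimal, condensed
sketch of the new normal-mode S16 mix loop (variable declarations
omitted; helper names as used in the code below):

	while (left_frames > 0) {
		/* wrap both pointers once per span, not once per sample */
		src = audio_stream_wrap(source, src);
		dst = audio_stream_wrap(sink, dst);
		/* contiguous samples available before either buffer wraps */
		n = MIN(left_frames, audio_stream_samples_without_wrap_s16(source, src));
		n = MIN(n, audio_stream_samples_without_wrap_s16(sink, dst));
		/* tight per-sample loop with no boundary checks inside */
		for (i = 0; i < n; i++)
			dst[i] = sat_int16((int32_t)dst[i] + (int32_t)src[i]);
		src += n;
		dst += n;
		left_frames -= n;
	}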

Signed-off-by: Andrula Song <xiaoyuan.song@intel.com>
Andrula Song 2022-11-17 11:04:03 +08:00 committed by Liam Girdwood
parent 35f2e5f84f
commit 7f36f83333
7 changed files with 1131 additions and 312 deletions

View File

@@ -3,7 +3,7 @@
if(CONFIG_IPC_MAJOR_3)
set(mixer_src mixer/mixer.c mixer/mixer_generic.c mixer/mixer_hifi3.c)
elseif(CONFIG_IPC_MAJOR_4)
set(mixer_src mixin_mixout.c)
set(mixer_src mixin_mixout/mixin_mixout.c mixin_mixout/mixin_mixout_generic.c mixin_mixout/mixin_mixout_hifi3.c)
endif()
if(NOT CONFIG_LIBRARY)

View File

@@ -0,0 +1 @@
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c)

View File

@@ -116,16 +116,9 @@ struct mixin_data {
* private data as ipc4_base_module_cfg!
*/
struct ipc4_base_module_cfg base_cfg;
void (*mix_channel)(struct audio_stream __sparse_cache *sink, uint8_t sink_channel_index,
uint8_t sink_channel_count, uint32_t start_frame, uint32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
uint8_t source_channel_index, uint8_t source_channel_count,
uint32_t frame_count, uint16_t gain);
void (*mute_channel)(struct audio_stream __sparse_cache *stream, uint8_t channel_index,
uint32_t start_frame, uint32_t mixed_frames, uint32_t frame_count);
normal_mix_func normal_mix_channel;
remap_mix_func remap_mix_channel;
mute_func mute_channel;
struct mixin_sink_config sink_config[MIXIN_MAX_SINKS];
};
@@ -278,287 +271,6 @@ static void reset_mixed_data_info_frame_counters(struct mixed_data_info __sparse
mdi->source_info[i].consumed_yet_not_produced_frames = 0;
}
#if CONFIG_FORMAT_S16LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void mix_channel_s16(struct audio_stream __sparse_cache *sink, uint8_t sink_channel_index,
uint8_t sink_channel_count, uint32_t start_frame, uint32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
uint8_t source_channel_index, uint8_t source_channel_count,
uint32_t frame_count, uint16_t gain)
{
int16_t *dest, *src;
uint32_t frames_to_mix, frames_to_copy;
/* audio_stream_wrap() is required and is done below in a loop */
dest = (int16_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int16_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = MIN(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* we do not want to use something like audio_stream_frames_without_wrap() here
* as it uses stream->channels internally. Also we would like to avoid expensive
* division operations.
*/
while (frames_to_mix) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_mix && src < (int16_t *)source->end_addr &&
dest < (int16_t *)sink->end_addr) {
*dest = sat_int16((int32_t)*dest + (int32_t)*src);
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
else
while (frames_to_mix && src < (int16_t *)source->end_addr &&
dest < (int16_t *)sink->end_addr) {
*dest = sat_int16((int32_t)*dest +
q_mults_16x16(*src, gain,
IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
}
while (frames_to_copy) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_copy && src < (int16_t *)source->end_addr &&
dest < (int16_t *)sink->end_addr) {
*dest = *src;
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
else
while (frames_to_copy && src < (int16_t *)source->end_addr &&
dest < (int16_t *)sink->end_addr) {
*dest = (int16_t)q_mults_16x16(*src, gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
}
}
static void mute_channel_s16(struct audio_stream __sparse_cache *stream, uint8_t channel_index,
uint32_t start_frame, uint32_t mixed_frames, uint32_t frame_count)
{
uint32_t skip_mixed_frames;
int16_t *ptr;
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
/* audio_stream_wrap() is needed here and it is just below in a loop */
ptr = (int16_t *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
while (frame_count) {
ptr = audio_stream_wrap(stream, ptr);
while (frame_count && ptr < (int16_t *)stream->end_addr) {
*ptr = 0;
ptr += stream->channels;
frame_count--;
}
}
}
#endif /* CONFIG_FORMAT_S16LE */
#if CONFIG_FORMAT_S24LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void mix_channel_s24(struct audio_stream __sparse_cache *sink, uint8_t sink_channel_index,
uint8_t sink_channel_count, uint32_t start_frame, uint32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
uint8_t source_channel_index, uint8_t source_channel_count,
uint32_t frame_count, uint16_t gain)
{
int32_t *dest, *src;
uint32_t frames_to_mix, frames_to_copy;
/* audio_stream_wrap() is required and is done below in a loop */
dest = (int32_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int32_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = MIN(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* we do not want to use something like audio_stream_frames_without_wrap() here
* as it uses stream->channels internally. Also we would like to avoid expensive
* division operations.
*/
while (frames_to_mix) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_mix && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = sat_int24(sign_extend_s24(*dest) + sign_extend_s24(*src));
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
else
while (frames_to_mix && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = sat_int24(sign_extend_s24(*dest) +
(int32_t)q_mults_32x32(sign_extend_s24(*src),
gain, IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
}
while (frames_to_copy) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_copy && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = *src;
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
else
while (frames_to_copy && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = (int32_t)q_mults_32x32(sign_extend_s24(*src),
gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
}
}
#endif /* CONFIG_FORMAT_S24LE */
#if CONFIG_FORMAT_S32LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void mix_channel_s32(struct audio_stream __sparse_cache *sink, uint8_t sink_channel_index,
uint8_t sink_channel_count, uint32_t start_frame, uint32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
uint8_t source_channel_index, uint8_t source_channel_count,
uint32_t frame_count, uint16_t gain)
{
int32_t *dest, *src;
uint32_t frames_to_mix, frames_to_copy;
/* audio_stream_wrap() is required and is done below in a loop */
dest = (int32_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int32_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = MIN(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* we do not want to use something like audio_stream_frames_without_wrap() here
* as it uses stream->channels internally. Also we would like to avoid expensive
* division operations.
*/
while (frames_to_mix) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_mix && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = sat_int32((int64_t)*dest + (int64_t)*src);
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
else
while (frames_to_mix && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = sat_int32((int64_t)*dest +
q_mults_32x32(*src, gain, IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dest += sink_channel_count;
frames_to_mix--;
}
}
while (frames_to_copy) {
src = audio_stream_wrap(source, src);
dest = audio_stream_wrap(sink, dest);
if (gain == IPC4_MIXIN_UNITY_GAIN)
while (frames_to_copy && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = *src;
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
else
while (frames_to_copy && src < (int32_t *)source->end_addr &&
dest < (int32_t *)sink->end_addr) {
*dest = (int32_t)q_mults_32x32(*src, gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dest += sink_channel_count;
frames_to_copy--;
}
}
}
static void mute_channel_s32(struct audio_stream __sparse_cache *stream, uint8_t channel_index,
uint32_t start_frame, uint32_t mixed_frames, uint32_t frame_count)
{
uint32_t skip_mixed_frames;
int32_t *ptr;
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
/* audio_stream_wrap() is needed here and it is just below in a loop */
ptr = (int32_t *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
while (frame_count) {
ptr = audio_stream_wrap(stream, ptr);
while (frame_count && ptr < (int32_t *)stream->end_addr) {
*ptr = 0;
ptr += stream->channels;
frame_count--;
}
}
}
#endif /* CONFIG_FORMAT_S32LE */
static int mix_and_remap(struct comp_dev *dev, const struct mixin_data *mixin_data,
uint16_t sink_index, struct audio_stream __sparse_cache *sink,
uint32_t start_frame, uint32_t mixed_frames,
@@ -580,8 +292,8 @@ static int mix_and_remap(struct comp_dev *dev, const struct mixin_data *mixin_da
* channel count is passed as 1, channel index is 0, frame indices (start_frame
* and mixed_frame) and frame count are multiplied by real stream channel count.
*/
mixin_data->mix_channel(sink, 0, 1, start_frame * sink->channels,
mixed_frames * sink->channels, source, 0, 1,
mixin_data->normal_mix_channel(sink, start_frame * sink->channels,
mixed_frames * sink->channels, source,
frame_count * sink->channels, sink_config->gain);
} else if (sink_config->mixer_mode == IPC4_MIXER_CHANNEL_REMAPPING_MODE) {
int i;
@@ -600,7 +312,7 @@ static int mix_and_remap(struct comp_dev *dev, const struct mixin_data *mixin_da
source->channels);
return -EINVAL;
}
mixin_data->mix_channel(sink, i, sink->channels, start_frame,
mixin_data->remap_mix_channel(sink, i, sink->channels, start_frame,
mixed_frames, source, source_channel,
source->channels, frame_count,
sink_config->gain);
@@ -952,7 +664,8 @@ static int mixin_reset(struct comp_dev *dev)
comp_dbg(dev, "mixin_reset()");
mixin_data = comp_get_drvdata(dev);
mixin_data->mix_channel = NULL;
mixin_data->normal_mix_channel = NULL;
mixin_data->remap_mix_channel = NULL;
mixin_data->mute_channel = NULL;
comp_set_state(dev, COMP_TRIGGER_RESET);
@@ -1021,29 +734,23 @@ static int mixin_prepare(struct comp_dev *dev)
/* currently inactive so setup mixer */
switch (fmt) {
#if CONFIG_FORMAT_S16LE
case SOF_IPC_FRAME_S16_LE:
md->mix_channel = mix_channel_s16;
md->mute_channel = mute_channel_s16;
break;
#endif /* CONFIG_FORMAT_S16LE */
#if CONFIG_FORMAT_S24LE
case SOF_IPC_FRAME_S24_4LE:
md->mix_channel = mix_channel_s24;
md->mute_channel = mute_channel_s32; /* yes, 32 is correct */
break;
#endif /* CONFIG_FORMAT_S24LE */
#if CONFIG_FORMAT_S32LE
case SOF_IPC_FRAME_S32_LE:
md->mix_channel = mix_channel_s32;
md->mute_channel = mute_channel_s32;
md->normal_mix_channel = normal_mix_get_processing_function(fmt);
md->remap_mix_channel = remap_mix_get_processing_function(fmt);
md->mute_channel = mute_mix_get_processing_function(fmt);
break;
#endif /* CONFIG_FORMAT_S32LE */
default:
comp_err(dev, "unsupported data format");
return -EINVAL;
}
if (!md->normal_mix_channel || !md->remap_mix_channel || !md->mute_channel) {
comp_err(dev, "have not found the suitable processing function");
return -EINVAL;
}
ret = comp_set_state(dev, COMP_TRIGGER_PREPARE);
if (ret < 0)
return ret;
@@ -1322,7 +1029,7 @@ static int mixout_unbind(struct comp_dev *dev, void *data)
mixin = ipc4_get_comp_dev(src_id);
if (!mixin) {
comp_err(dev, "mixout_bind: no source with ID %d found", src_id);
comp_err(dev, "mixout_unbind: no source with ID %d found", src_id);
mixed_data_info_release(mixed_data_info);
return -EINVAL;
}

View File

@@ -0,0 +1,415 @@
// SPDX-License-Identifier: BSD-3-Clause
//
// Copyright(c) 2022 Intel Corporation. All rights reserved.
//
// Author: Andrula Song <xiaoyuan.song@intel.com>
#include <ipc4/mixin_mixout.h>
#include <sof/common.h>
#include <rtos/string.h>
#ifdef MIXIN_MIXOUT_GENERIC
#if CONFIG_FORMAT_S16LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s16(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, i;
/* audio_stream_wrap() is required and is done below in a loop */
int16_t *dst = (int16_t *)sink->w_ptr + start_frame;
int16_t *src = (int16_t *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s16(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = MIN(n, nmax);
for (i = 0; i < n; i++) {
*dst = sat_int16(*dst + *src++);
dst++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s16(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = MIN(n, nmax);
memcpy_s(dst, n * sizeof(int16_t), src, n * sizeof(int16_t));
}
}
static void remap_mix_channel_s16(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
int16_t *dst, *src;
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, frames, i, samples;
/* audio_stream_wrap() is required and is done below in a loop */
dst = (int16_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int16_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s16(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = MIN(n, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = sat_int16((int32_t)*dst +
q_mults_16x16(*src, gain, IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s16(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = MIN(n, nmax);
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = (int16_t)q_mults_16x16(*src, gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
}
static void mute_channel_s16(struct audio_stream __sparse_cache *stream, int32_t channel_index,
int32_t start_frame, int32_t mixed_frames, int32_t frame_count)
{
int32_t skip_mixed_frames, n, left_frames, i, channel_count, frames, samples;
int16_t *ptr;
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
channel_count = stream->channels;
/* audio_stream_wrap() is needed here and it is just below in a loop */
ptr = (int16_t *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
for (left_frames = frame_count; left_frames; left_frames -= frames) {
ptr = audio_stream_wrap(stream, ptr);
n = audio_stream_samples_without_wrap_s16(stream, ptr);
samples = left_frames * channel_count;
n = MIN(samples, n);
frames = 0;
for (i = 0; i < n; i += channel_count) {
*ptr = 0;
ptr += channel_count;
frames++;
}
}
}
#endif /* CONFIG_FORMAT_S16LE */
#if CONFIG_FORMAT_S24LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s24(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, i;
/* audio_stream_wrap() is required and is done below in a loop */
int32_t *dst = (int32_t *)sink->w_ptr + start_frame;
int32_t *src = (int32_t *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s24(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = MIN(n, nmax);
for (i = 0; i < n; i++) {
*dst = sat_int24(sign_extend_s24(*dst) + sign_extend_s24(*src++));
dst++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s24(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = MIN(n, nmax);
memcpy_s(dst, n * sizeof(int32_t), src, n * sizeof(int32_t));
}
}
static void remap_mix_channel_s24(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
int32_t *dst, *src;
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, i, frames, samples;
/* audio_stream_wrap() is required and is done below in a loop */
dst = (int32_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int32_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s24(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = MIN(n, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = sat_int24(sign_extend_s24(*dst) +
(int32_t)q_mults_32x32(sign_extend_s24(*src),
gain, IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s24(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = MIN(n, nmax);
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = (int32_t)q_mults_32x32(sign_extend_s24(*src),
gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
}
#endif /* CONFIG_FORMAT_S24LE */
#if CONFIG_FORMAT_S32LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s32(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, i;
int32_t *dst = (int32_t *)sink->w_ptr + start_frame;
int32_t *src = (int32_t *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s32(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = MIN(n, nmax);
for (i = 0; i < n; i++) {
*dst = sat_int32((int64_t)*dst + (int64_t)*src++);
dst++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s32(source, src);
n = MIN(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = MIN(n, nmax);
memcpy_s(dst, n * sizeof(int32_t), src, n * sizeof(int32_t));
}
}
static void remap_mix_channel_s32(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, frames, i, samples;
int32_t *dst, *src;
/* audio_stream_wrap() is required and is done below in a loop */
dst = (int32_t *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (int32_t *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = mixed_frames - start_frame;
frames_to_mix = MIN(frames_to_mix, frame_count);
frames_to_copy = frame_count - frames_to_mix;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s32(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = MIN(n, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = sat_int32((int64_t)*dst +
q_mults_32x32(*src, gain, IPC4_MIXIN_GAIN_SHIFT));
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
src = audio_stream_wrap(source, src);
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s32(source, src);
samples = left_frames * source_channel_count;
n = MIN(samples, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = MIN(n, nmax);
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
*dst = (int32_t)q_mults_32x32(*src, gain, IPC4_MIXIN_GAIN_SHIFT);
src += source_channel_count;
dst += sink_channel_count;
frames++;
}
}
}
#endif /* CONFIG_FORMAT_S32LE */
#if CONFIG_FORMAT_S32LE || CONFIG_FORMAT_S24LE
static void mute_channel_s32(struct audio_stream __sparse_cache *stream, int32_t channel_index,
int32_t start_frame, int32_t mixed_frames, int32_t frame_count)
{
int32_t skip_mixed_frames, left_frames, n, channel_count, i, frames, samples;
int32_t *ptr;
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
channel_count = stream->channels;
ptr = (int32_t *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
for (left_frames = frame_count; left_frames > 0; left_frames -= frames) {
ptr = audio_stream_wrap(stream, ptr);
n = audio_stream_samples_without_wrap_s32(stream, ptr);
samples = left_frames * channel_count;
n = MIN(samples, n);
frames = 0;
for (i = 0; i < n; i += channel_count) {
*ptr = 0;
ptr += channel_count;
frames++;
}
}
}
#endif
const struct mix_func_map mix_func_map[] = {
#if CONFIG_FORMAT_S16LE
{ SOF_IPC_FRAME_S16_LE, normal_mix_channel_s16, remap_mix_channel_s16, mute_channel_s16},
#endif
#if CONFIG_FORMAT_S24LE
{ SOF_IPC_FRAME_S24_4LE, normal_mix_channel_s24, remap_mix_channel_s24, mute_channel_s32},
#endif
#if CONFIG_FORMAT_S32LE
{ SOF_IPC_FRAME_S32_LE, normal_mix_channel_s32, remap_mix_channel_s32, mute_channel_s32}
#endif
};
const size_t mix_count = ARRAY_SIZE(mix_func_map);
#endif

View File

@@ -0,0 +1,589 @@
// SPDX-License-Identifier: BSD-3-Clause
//
// Copyright(c) 2022 Intel Corporation. All rights reserved.
//
// Author: Andrula Song <xiaoyuan.song@intel.com>
#include <ipc4/mixin_mixout.h>
#include <sof/common.h>
#ifdef MIXIN_MIXOUT_HIFI3
#if CONFIG_FORMAT_S16LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s16(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int frames_to_mix, frames_to_copy, left_frames;
int n, nmax, i, m, left;
ae_int16x4 in_sample;
ae_int16x4 out_sample;
ae_int16x4 *in;
ae_int16x4 *out;
ae_valign inu = AE_ZALIGN64();
ae_valign outu1 = AE_ZALIGN64();
ae_valign outu2 = AE_ZALIGN64();
/* audio_stream_wrap() is required and is done below in a loop */
ae_int16 *dst = (ae_int16 *)sink->w_ptr + start_frame;
ae_int16 *src = (ae_int16 *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
n = 0;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s16(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int16x4 *)src;
out = (ae_int16x4 *)dst;
inu = AE_LA64_PP(in);
outu1 = AE_LA64_PP(out);
m = n >> 2;
left = n & 0x03;
/* process 4 frames per loop */
for (i = 0; i < m; i++) {
AE_LA16X4_IP(in_sample, inu, in);
AE_LA16X4_IP(out_sample, outu1, out);
out--;
out_sample = AE_ADD16S(in_sample, out_sample);
AE_SA16X4_IP(out_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left samples that less than 4
* one by one to avoid memory access overrun
*/
for (i = 0; i < left ; i++) {
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
AE_L16_IP(out_sample, (ae_int16 *)out, 0);
out_sample = AE_ADD16S(in_sample, out_sample);
AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s16(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int16x4 *)src;
out = (ae_int16x4 *)dst;
inu = AE_LA64_PP(in);
m = n >> 2;
left = n & 0x03;
/* process 4 frames per loop */
for (i = 0; i < m; i++) {
AE_LA16X4_IP(in_sample, inu, in);
AE_SA16X4_IP(in_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left samples that less than 4
* one by one to avoid memory access overrun
*/
for (i = 0; i < left ; i++) {
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
}
}
}
static void remap_mix_channel_s16(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
ae_int16 *dst, *src;
int frames_to_mix, frames_to_copy, left_frames;
int n, nmax, frames, i;
int inoff = source_channel_count * sizeof(ae_int16);
int outoff = sink_channel_count * sizeof(ae_int16);
ae_int16x4 in;
ae_int16x4 out;
ae_int16 *pgain = (ae_int16 *)&gain;
ae_int16x4 gain_v;
ae_int32x2 temp, out1;
dst = (ae_int16 *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (ae_int16 *)source->r_ptr + source_channel_index;
src = audio_stream_wrap(source, src);
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* store gain to a AE_DR register gain_v*/
AE_L16_IP(gain_v, pgain, 0);
/* set source as circular buffer, hifi3 only have 1 circular buffer*/
AE_SETCBEGIN0(source->addr);
AE_SETCEND0(source->end_addr);
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
/* audio_stream_wrap() is required and is done below in a loop */
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples of sink*/
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L16_XC(in, src, inoff);
AE_L16_XP(out, dst, 0);
/* Q1.15 * Q1.15 to Q2.30*/
temp = AE_MULF16SS_00(in, gain_v);
temp = AE_SRAI32R(temp, IPC4_MIXIN_GAIN_SHIFT + 1);
out1 = AE_SEXT32X2D16_10(out);
temp = AE_ADD32S(temp, out1);
temp = AE_SLAI32S(temp, 16);
out = AE_ROUND16X4F32SSYM(temp, temp);
AE_S16_0_XP(out, dst, outoff);
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s16(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L16_XC(in, src, inoff);
AE_L16_XP(out, dst, 0);
temp = AE_MULF16SS_00(in, gain_v);
temp = AE_SRAI32R(temp, IPC4_MIXIN_GAIN_SHIFT + 1);
temp = AE_SLAI32S(temp, 16);
out = AE_ROUND16X4F32SSYM(temp, temp);
AE_S16_0_XP(out, dst, outoff);
frames++;
}
}
}
static void mute_channel_s16(struct audio_stream __sparse_cache *stream, int32_t channel_index,
int32_t start_frame, int32_t mixed_frames, int32_t frame_count)
{
int skip_mixed_frames, left_frames;
int off = stream->channels * sizeof(ae_int16);
ae_int16 *ptr;
ae_int16x4 zero = AE_ZERO16();
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
AE_SETCBEGIN0(stream->addr);
AE_SETCEND0(stream->end_addr);
/* audio_stream_wrap() is needed here and it is just below in a loop */
ptr = (ae_int16 *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
ptr = audio_stream_wrap(stream, ptr);
for (left_frames = frame_count ; left_frames; left_frames--)
AE_S16_0_XC(zero, ptr, off);
}
#endif /* CONFIG_FORMAT_S16LE */
#if CONFIG_FORMAT_S24LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s24(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int frames_to_mix, frames_to_copy, left_frames;
int n, nmax, i, m, left;
ae_int32x2 in_sample;
ae_int32x2 out_sample;
ae_int32x2 *in;
ae_int32x2 *out;
ae_valign inu = AE_ZALIGN64();
ae_valign outu1 = AE_ZALIGN64();
ae_valign outu2 = AE_ZALIGN64();
/* audio_stream_wrap() is required and is done below in a loop */
int32_t *dst = (int32_t *)sink->w_ptr + start_frame;
int32_t *src = (int32_t *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
n = 0;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s24(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int32x2 *)src;
out = (ae_int32x2 *)dst;
inu = AE_LA64_PP(in);
outu1 = AE_LA64_PP(out);
m = n >> 1;
left = n & 1;
/* process 2 samples per time */
for (i = 0; i < m; i++) {
AE_LA32X2_IP(in_sample, inu, in);
AE_LA32X2_IP(out_sample, outu1, out);
out--;
out_sample = AE_ADD24S(in_sample, out_sample);
AE_SA32X2_IP(out_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
out_sample = AE_ADD24S(in_sample, out_sample);
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
nmax = audio_stream_samples_without_wrap_s24(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int32x2 *)src;
out = (ae_int32x2 *)dst;
inu = AE_LA64_PP(in);
m = n >> 1;
left = n & 1;
for (i = 0; i < m; i++) {
AE_LA32X2_IP(in_sample, inu, in);
AE_SA32X2_IP(in_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
}
static void remap_mix_channel_s24(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
int frames_to_mix, frames_to_copy, left_frames;
int n, nmax, i, frames;
int inoff = source_channel_count * sizeof(ae_int32);
int outoff = sink_channel_count * sizeof(ae_int32);
ae_int32x2 in;
ae_int32x2 out;
ae_int64 tmp;
ae_int16 *pgain = (ae_int16 *)&gain;
ae_int16x4 gain_v;
ae_int32 *dst, *src;
dst = (ae_int32 *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (ae_int32 *)source->r_ptr + source_channel_index;
src = audio_stream_wrap(source, src);
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* store gain to a AE_DR register gain_v*/
AE_L16_IP(gain_v, pgain, 0);
/* set source as circular buffer, hifi3 only have 1 circular buffer*/
AE_SETCBEGIN0(source->addr);
AE_SETCEND0(source->end_addr);
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
dst = audio_stream_wrap(sink, dst);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L32_XC(in, src, inoff);
/*shift the significant 8 bits to the left*/
in = AE_SLAI32(in, 8);
AE_L32_XP(out, dst, 0);
out = AE_SLAI32(out, 8);
out = AE_SRAI32(out, 8);
tmp = AE_MUL32X16_H0(in, gain_v);
/* shift should be IPC4_MIXIN_GAIN_SHIFT + 8(shift right for in)
* - 16(to keep the valid LSB bits of ae_int64)
*/
in = AE_ROUND32F48SSYM(AE_SRAI64(tmp, IPC4_MIXIN_GAIN_SHIFT - 8));
out = AE_ADD32S(out, in);
out = AE_SLAI32S(out, 8);
out = AE_SRAI32(out, 8);
AE_S32_L_XP(out, dst, outoff);
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s24(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L32_XC(in, src, inoff);
/*shift the significant bit to the left*/
in = AE_SLAI32(in, 8);
tmp = AE_MUL32X16_H0(in, gain_v);
out = AE_ROUND32F48SSYM(AE_SRAI64(tmp, IPC4_MIXIN_GAIN_SHIFT - 8));
out = AE_SLAI32S(out, 8);
out = AE_SRAI32(out, 8);
AE_S32_L_XP(out, dst, outoff);
frames++;
}
}
}
#endif /* CONFIG_FORMAT_S24LE */
#if CONFIG_FORMAT_S32LE
/* Instead of using sink->channels and source->channels, sink_channel_count and
* source_channel_count are supplied as parameters. This is done to reuse the function
* to also mix an entire stream. In this case the function is called with fake stream
* parameters: multichannel stream is treated as single channel and so the entire stream
* contents is mixed.
*/
static void normal_mix_channel_s32(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain)
{
int frames_to_mix, frames_to_copy, left_frames;
int n, nmax, i, m, left;
ae_int32x2 in_sample;
ae_int32x2 out_sample;
ae_int32x2 *in;
ae_int32x2 *out;
ae_valign inu = AE_ZALIGN64();
ae_valign outu1 = AE_ZALIGN64();
ae_valign outu2 = AE_ZALIGN64();
/* audio_stream_wrap() is required and is done below in a loop */
int32_t *dst = (int32_t *)sink->w_ptr + start_frame;
int32_t *src = (int32_t *)source->r_ptr;
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
n = 0;
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s32(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int32x2 *)src;
out = (ae_int32x2 *)dst;
inu = AE_LA64_PP(in);
outu1 = AE_LA64_PP(out);
m = n >> 1;
left = n & 1;
for (i = 0; i < m; i++) {
AE_LA32X2_IP(in_sample, inu, in);
AE_LA32X2_IP(out_sample, outu1, out);
out--;
out_sample = AE_ADD32S(in_sample, out_sample);
AE_SA32X2_IP(out_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
out_sample = AE_ADD32S(in_sample, out_sample);
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= n) {
src = audio_stream_wrap(source, src + n);
dst = audio_stream_wrap(sink, dst + n);
/* calculate the remaining samples*/
nmax = audio_stream_samples_without_wrap_s32(source, src);
n = AE_MIN_32_signed(left_frames, nmax);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = AE_MIN_32_signed(n, nmax);
in = (ae_int32x2 *)src;
out = (ae_int32x2 *)dst;
inu = AE_LA64_PP(in);
m = n >> 1;
left = n & 1;
for (i = 0; i < m; i++) {
AE_LA32X2_IP(in_sample, inu, in);
AE_SA32X2_IP(in_sample, outu2, out);
}
AE_SA64POS_FP(outu2, out);
/* process the left sample to avoid memory access overrun */
if (left) {
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
}
}
}
static void remap_mix_channel_s32(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index, int32_t sink_channel_count,
int32_t start_frame, int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain)
{
int inoff = source_channel_count * sizeof(ae_int32);
int outoff = sink_channel_count * sizeof(ae_int32);
ae_int32x2 in;
ae_int32x2 out;
ae_int64 tmp, tmp1;
ae_int16 *pgain = (ae_int16 *)&gain;
ae_int16x4 gain_v;
int32_t frames_to_mix, frames_to_copy, left_frames;
int32_t n, nmax, frames, i;
ae_int32 *dst, *src;
/* audio_stream_wrap() is required and is done below in a loop */
dst = (ae_int32 *)sink->w_ptr + start_frame * sink_channel_count + sink_channel_index;
src = (ae_int32 *)source->r_ptr + source_channel_index;
assert(mixed_frames >= start_frame);
frames_to_mix = AE_MIN_32_signed(mixed_frames - start_frame, frame_count);
frames_to_copy = frame_count - frames_to_mix;
/* store gain to a AE_DR register gain_v*/
AE_L16_IP(gain_v, pgain, 0);
/* set source as circular buffer, hifi3 only have 1 circular buffer*/
AE_SETCBEGIN0(source->addr);
AE_SETCEND0(source->end_addr);
src = audio_stream_wrap(source, src);
for (left_frames = frames_to_mix; left_frames > 0; left_frames -= frames) {
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L32_XC(in, src, inoff);
AE_L32_XP(out, dst, 0);
tmp = AE_MUL32X16_H0(in, gain_v);
/* shift should be -(IPC4_MIXIN_GAIN_SHIFT
* - 16(to keep the valid LSB bits of ae_int64))
*/
tmp = AE_SLAI64S(tmp, 16 - IPC4_MIXIN_GAIN_SHIFT);
tmp1 = AE_CVT48A32(out);
tmp = AE_ADD64S(tmp, tmp1);
out = AE_ROUND32F48SSYM(tmp);
AE_S32_L_XP(out, dst, outoff);
frames++;
}
}
for (left_frames = frames_to_copy; left_frames > 0; left_frames -= frames) {
dst = audio_stream_wrap(sink, dst);
nmax = audio_stream_samples_without_wrap_s32(sink, dst);
n = AE_MIN_32_signed(left_frames * sink_channel_count, nmax);
/* frames is the processed frame count in this loop*/
frames = 0;
for (i = 0; i < n; i += source_channel_count) {
AE_L32_XC(in, src, inoff);
tmp = AE_MUL32X16_H0(in, gain_v);
tmp = AE_SLAI64S(tmp, 16 - IPC4_MIXIN_GAIN_SHIFT);
out = AE_ROUND32F48SSYM(tmp);
AE_S32_L_XP(out, dst, outoff);
frames++;
}
}
}
#endif /* CONFIG_FORMAT_S32LE */
#if CONFIG_FORMAT_S32LE || CONFIG_FORMAT_S24LE
static void mute_channel_s32(struct audio_stream __sparse_cache *stream, int32_t channel_index,
int32_t start_frame, int32_t mixed_frames, int32_t frame_count)
{
int skip_mixed_frames, left_frames;
ae_int32 *ptr;
int off = stream->channels * sizeof(ae_int32);
ae_int32x2 zero = AE_ZERO32();
assert(mixed_frames >= start_frame);
skip_mixed_frames = mixed_frames - start_frame;
if (frame_count <= skip_mixed_frames)
return;
frame_count -= skip_mixed_frames;
AE_SETCBEGIN0(stream->addr);
AE_SETCEND0(stream->end_addr);
/* audio_stream_wrap() is needed here and it is just below in a loop */
ptr = (ae_int32 *)stream->w_ptr + mixed_frames * stream->channels + channel_index;
ptr = audio_stream_wrap(stream, ptr);
for (left_frames = frame_count ; left_frames > 0; left_frames--)
AE_S32_L_XC(zero, ptr, off);
}
#endif
const struct mix_func_map mix_func_map[] = {
#if CONFIG_FORMAT_S16LE
{ SOF_IPC_FRAME_S16_LE, normal_mix_channel_s16, remap_mix_channel_s16, mute_channel_s16},
#endif
#if CONFIG_FORMAT_S24LE
{ SOF_IPC_FRAME_S24_4LE, normal_mix_channel_s24, remap_mix_channel_s24, mute_channel_s32},
#endif
#if CONFIG_FORMAT_S32LE
{ SOF_IPC_FRAME_S32_LE, normal_mix_channel_s32, remap_mix_channel_s32, mute_channel_s32}
#endif
};
const size_t mix_count = ARRAY_SIZE(mix_func_map);
#endif

View File

@@ -25,6 +25,23 @@
#include <stdint.h>
#include <rtos/bit.h>
#include <sof/audio/buffer.h>
#include <sof/audio/component.h>
#include <sof/audio/format.h>
#include <sof/platform.h>
#include <stddef.h>
#define MIXIN_MIXOUT_GENERIC
#if defined(__XCC__)
#include <xtensa/config/core-isa.h>
#if XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4
#undef MIXIN_MIXOUT_GENERIC
#define MIXIN_MIXOUT_HIFI3
#endif
#endif
enum ipc4_mixin_config_param {
/* large_config_set param id for ipc4_mixer_mode_config */
@@ -85,4 +102,92 @@ struct ipc4_mixer_mode_config {
struct ipc4_mixer_mode_sink_config mixer_mode_sink_configs[1];
} __packed __aligned(4);
/**
* \brief remap mode mixin_mixout processing function interface
*/
typedef void (*remap_mix_func)(struct audio_stream __sparse_cache *sink,
int32_t sink_channel_index,
int32_t sink_channel_count, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t source_channel_index, int32_t source_channel_count,
int32_t frame_count, uint16_t gain);
/**
* \brief normal mode mixin_mixout processing function interface
*/
typedef void (*normal_mix_func)(struct audio_stream __sparse_cache *sink, int32_t start_frame,
int32_t mixed_frames,
const struct audio_stream __sparse_cache *source,
int32_t frame_count, uint16_t gain);
/**
* \brief mixin_mixout mute processing function interface
*/
typedef void (*mute_func) (struct audio_stream __sparse_cache *stream, int32_t channel_index,
int32_t start_frame, int32_t mixed_frames, int32_t frame_count);
/**
* @brief mixin_mixout processing functions map.
*/
struct mix_func_map {
uint16_t frame_fmt; /* frame format */
normal_mix_func normal_func; /* normal mode mixin_mixout processing function */
remap_mix_func remap_func; /* remap mode mixin_mixout processing function */
mute_func mute_func; /* mute processing function */
};
extern const struct mix_func_map mix_func_map[];
extern const size_t mix_count;
/**
* \brief Retrieves normal mode mixer processing function.
* \param[in] fmt stream PCM frame format
*/
static inline normal_mix_func normal_mix_get_processing_function(int fmt)
{
int i;
/* map the normal mode mixin_mixout function for source and sink buffers */
for (i = 0; i < mix_count; i++) {
if (fmt == mix_func_map[i].frame_fmt)
return mix_func_map[i].normal_func;
}
return NULL;
}
/**
* \brief Retrieves remap mode mixer processing function.
* \param[in] fmt stream PCM frame format
*/
static inline remap_mix_func remap_mix_get_processing_function(int fmt)
{
int i;
/* map the remap mode mixin_mixout function for source and sink buffers */
for (i = 0; i < mix_count; i++) {
if (fmt == mix_func_map[i].frame_fmt)
return mix_func_map[i].remap_func;
}
return NULL;
}
/**
* \brief Retrieves mute processing function.
* \param[in] fmt stream PCM frame format
*/
static inline mute_func mute_mix_get_processing_function(int fmt)
{
int i;
/* map the mute function for source and sink buffers */
for (i = 0; i < mix_count; i++) {
if (fmt == mix_func_map[i].frame_fmt)
return mix_func_map[i].mute_func;
}
return NULL;
}
#endif /* __SOF_IPC4_MIXIN_MIXOUT_H__ */

View File

@@ -693,7 +693,9 @@ if(CONFIG_IPC_MAJOR_3)
)
elseif(CONFIG_IPC_MAJOR_4)
zephyr_library_sources_ifdef(CONFIG_COMP_MIXER
${SOF_AUDIO_PATH}/mixin_mixout.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_generic.c
${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi3.c
)
endif()