From e841dfcc49a11bac4a1d6ab5806672b5104b036e Mon Sep 17 00:00:00 2001
From: Marc Herbert <marc.herbert@intel.com>
Date: Fri, 6 Jan 2023 22:05:25 +0000
Subject: [PATCH] src/arch/xtensa: normalize two odd files from CRLF to LF

These two generated files are the only ones stored in git with CRLF
line endings, for no good reason. This is probably because they were
generated on Windows, then copied to Linux before being checked into
git. If they had been added by git running on Windows, then the default
`autocrlf=true` would likely have converted them to LF.

These CRLF EOLs have caused a `git hash-object` mismatch before (as
seen in #5917). Now that Windows is compiling SOF, #5920 must be
reverted (see previous commit) and they are causing mismatches again.

While this normalization unfortunately causes some large git noise, that
noise can be easily filtered out with [ git ] diff -b or -w or
--ignore-space-at-eol. GitHub and many others also have similar
filtering options. Temporarily converting them locally with `unix2dos`
or any editor is another option.

As they are generated, comparisons should most often be performed from
the source(s) they come from anyway.

Signed-off-by: Marc Herbert <marc.herbert@intel.com>
---
 src/arch/xtensa/hal/set_region_translate.c  | 1068 +++---
 src/arch/xtensa/include/xtensa/c6x-compat.h | 3516 +++++++++----------
 2 files changed, 2292 insertions(+), 2292 deletions(-)

diff --git a/src/arch/xtensa/hal/set_region_translate.c b/src/arch/xtensa/hal/set_region_translate.c
index b1b53ed4a..27ed6b80a 100644
--- a/src/arch/xtensa/hal/set_region_translate.c
+++ b/src/arch/xtensa/hal/set_region_translate.c
@@ -1,534 +1,534 @@
-/*
- * Copyright (c) 2004-2014 Tensilica Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#include <xtensa/config/core.h>
-
-#if XCHAL_HAVE_XEA2 && (!XCHAL_HAVE_MPU)
-/*
- * C-stubs to issue the tlb related instructions (with dsync and isync's if needed).
- *
- */
-static inline void write_dtlb_entry(unsigned vpn_way, unsigned ppn_ca) {
-	__asm__ __volatile__("wdtlb  %1, %0; dsync\n\t"
-			: : "r" (vpn_way), "r" (ppn_ca) );
-}
-
-static inline void write_itlb_entry(unsigned vpn_way, unsigned ppn_ca) {
-	__asm__ __volatile__("witlb  %1, %0; isync\n\t"
-			: : "r" (vpn_way), "r" (ppn_ca) );
-}
-
-static inline unsigned read_dtlb1_entry(unsigned addr) {
-	unsigned long tmp;
-	__asm__ __volatile__("rdtlb1  %0, %1\n\t"
-			: "=a" (tmp)
-			: "a" (addr));
-	return tmp;
-}
-
-static inline unsigned read_itlb1_entry(unsigned addr) {
-	unsigned long tmp;
-	__asm__ __volatile__("ritlb1  %0, %1\n\t"
-			: "=a" (tmp)
-			: "a" (addr));
-	return tmp;
-}
-
-static inline unsigned probe_dtlb(unsigned addr) {
-	unsigned long tmp;
-	__asm__ __volatile__("pdtlb  %0, %1\n\t"
-			: "=a" (tmp)
-			: "a" (addr));
-	return tmp;
-}
-
-static inline unsigned probe_itlb(unsigned addr) {
-	unsigned long tmp;
-	__asm__ __volatile__("pitlb  %0, %1\n\t"
-			: "=a" (tmp)
-			: "a" (addr));
-	return tmp;
-}
-
-static inline void invalidate_dtlb_entry(unsigned addr) {
-	__asm__ __volatile__("idtlb  %0; dsync \n\t"
-			: : "a" (addr));
-}
-
-static inline void invalidate_itlb_entry(unsigned addr) {
-	__asm__ __volatile__("iitlb  %0 ; isync\n\t"
-			: : "a" (addr));
-}
-
-static inline unsigned read_dtlbcfg() {
-	unsigned long tmp;
-	__asm__ __volatile__("rsr.dtlbcfg %0\n\t"
-			: "=a" (tmp));
-	return tmp;
-}
-
-static inline unsigned read_itlbcfg() {
-	unsigned long tmp;
-	__asm__ __volatile__("rsr.itlbcfg %0\n\t"
-			: "=a" (tmp));
-	return tmp;
-}
-
-#endif
-
-/*
- *  xthal_set_region_translation_raw is a quick and simple function
- *  to set both physical address <paddr> and cache attribute <cattr> for
- *  a 512MB region at <vaddr>.
- *
- *  Parameters:
- *  void* vaddr		512MB aligned pointer representing the start of virtual address region
- *  void* paddr		512MB aligned pointer representing the start of physical address region
- *  unsigned cattr	4 bit value encoding the caching properties and rights (MMU only).
- *
- *  returns 0 (XCHAL_SUCCESS) if successful
- *  returns non zero (XCHAL_UNSUPPORTED) on failure
- *
- *  This function has the following limitations:
- *
- *  1) Requires either the Region Translation Option or a v3 MMU running in the default mode (with spanning way)
- *  2) It does no error checking.
- *  3) Deals with one 512MB region (vaddr and paddr are required to be 512MB aligned although that is not explicitly checked)
- *  4) It requires the caller to do any cache flushing that is needed
- *  5) Doesn't support mnemonically setting the 'rights' (rwx, rw, ... ) bit on the MMU
- *  6) It is illegal to change the mapping of the region containing the current PC (not checked)
- *
- */
-int xthal_set_region_translation_raw(void *vaddr, void *paddr, unsigned cattr) {
-#if XCHAL_HAVE_MPU
-	return XTHAL_UNSUPPORTED;
-#else
-#if XCHAL_HAVE_XEA2
-#if XCHAL_HAVE_XLT_CACHEATTR || (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
-# if XCHAL_HAVE_XLT_CACHEATTR
-	unsigned vpn_way = (unsigned)vaddr;
-# else
-	unsigned vpn_way = ((unsigned) vaddr & 0xFFFFFFF0) + XCHAL_SPANNING_WAY;
-# endif
-	unsigned ppn_ca = ((unsigned) paddr & 0xFFFFFFF0) + (cattr & 0xF);
-	write_dtlb_entry(vpn_way, ppn_ca);
-	write_itlb_entry(vpn_way, ppn_ca);
-	return XTHAL_SUCCESS;
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-#endif
-}
-
-/*
- * xthal_v2p() takes a virtual address as input, and if that virtual address is mapped to a physical address
- * by the MMU, it returns the:
- * 		a) corresponding physical address
- * 		b) the tlb way that is used to translate the address
- * 		c) cache attribute for translation
- *
- * 	Parameters:
- * 	void* 		vaddr		A pointer representing the virtual address (there are no alignment requirements for this address)
- * 	void**		paddr		This value can be 0, or can point to a pointer variable which will be updated to contain the physical address
- * 	unsigned*	way			This value can be 0, or can point to an unsigned variable which will be updated to contain the TLB way.
- * 	unsigned*   cattr		This value can be 0, or can point to an unsigned variable which will be updated to contain the cache attr
- * 	                        For MPU configurations bits 0..3 hold the access rights and bits 4..8 hold the encoded memory type
- *
- *  Returns 	0 (XCHAL_SUCCESS) 				if successful
- * 				XTHAL_NO_MAPPING				if there is no current mapping for the virtual address
- * 				XCHAL_UNSUPPORTED            	if unsupported
- *
- * 	Limitations:
- * 					Assumes that architecture variable DVARWAY56 is "Variable"
- * 					Uses the D-TLBS for the translation ... assumption is that ITLB's have same mappings
- */
-int xthal_v2p(void* vaddr, void** paddr, unsigned *way, unsigned* cattr) {
-#if XCHAL_HAVE_XEA2
-#if XCHAL_HAVE_MPU
-  if (paddr)
-    *paddr = vaddr;
-  if (way)
-    *way = 0;
-  if (cattr)
-  {
-      struct xthal_MPU_entry x = xthal_get_entry_for_address(vaddr, 0);
-      *cattr = XTHAL_MPU_ENTRY_GET_ACCESS(x) | XTHAL_MPU_ENTRY_GET_MEMORY_TYPE(x) << XTHAL_AR_WIDTH;
-  }
-  return XTHAL_SUCCESS;
-#else
-	unsigned long probe = probe_dtlb((unsigned) vaddr);
-#if !XCHAL_HAVE_PTP_MMU
-	if (!(0x1 & probe))
-	return XTHAL_NO_MAPPING;
-	if (way)
-	*way = 1;
-	if (paddr || cattr) {
-		unsigned long temp;
-		temp = read_dtlb1_entry(probe);
-		unsigned ppn = 0xe0000000 & temp;
-		unsigned att = 0xf & temp;
-		if (paddr)
-		*paddr = ((void*) (ppn + (((unsigned) vaddr) & 0x1fffffff)));
-		if (cattr)
-		*cattr = att;
-	}
-#else
-	{
-		unsigned iway;
-		if (!(0x10 & probe))
-			return XTHAL_NO_MAPPING;
-		iway = 0xf & probe;
-		if (way)
-			*way = iway;
-		if (paddr || cattr) {
-			unsigned temp;
-			unsigned ppn;
-			unsigned ppn1;
-			unsigned dtlbcfg = read_dtlbcfg();
-			temp = read_dtlb1_entry(probe);
-			unsigned att = 0xf & temp;
-			if (cattr)
-				*cattr = att;
-			if (paddr)
-				switch (iway) // followin code derived from fig 4-40 from ISA MMU Option Data (at) Format for RxTLB1
-				{ /* 4k pages */
-				case 0:
-				case 1:
-				case 2:
-				case 3:
-				case 7:
-				case 8:
-				case 9:
-					ppn = 0xfffff000; // 4k pages
-					break;
-				case 4: {
-					switch ((dtlbcfg & (0x3 << 16)) >> 16) // bits 16 & 17
-					{
-					case 0: // 1MB pages
-						ppn = 0xfff00000;
-						break;
-					case 1: // 4MB pages
-						ppn = 0xffc00000;
-						break;
-					case 2: // 16MB pages
-						ppn = 0xff000000;
-						break;
-					case 3: // 64MB pages
-						ppn = 0xfc000000;
-						break;
-					default:
-						return XTHAL_UNSUPPORTED;
-					}
-				}
-					break;
-				case 5:
-					if ((dtlbcfg & (1 << 20)))
-						ppn = 0xf8000000; // 128MB pages
-					else
-						ppn = 0xf0000000; // 256MB pages
-					break;
-				case 6:
-					if ((dtlbcfg & (1 << 24)))
-						ppn = 0xe0000000; // 512MB pages
-					else
-						ppn = 0xf0000000; // 256MB pages
-					break;
-				default:
-					return XTHAL_UNSUPPORTED;
-					break;
-				}
-			ppn1 = ppn & temp;
-			*paddr = ((void*) (ppn1 + (((unsigned) vaddr) & (~ppn))));
-		}
-	}
-#endif
-	return XTHAL_SUCCESS;
-#endif
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-}
-
-/* these constants borrowed from xthal_set_region_attribute */
-# if XCHAL_HAVE_PTP_MMU
-#  define CA_BYPASS		XCHAL_CA_BYPASS
-#  define CA_WRITETHRU		XCHAL_CA_WRITETHRU
-#  define CA_WRITEBACK		XCHAL_CA_WRITEBACK
-#  define CA_WRITEBACK_NOALLOC	XCHAL_CA_WRITEBACK_NOALLOC
-#  define CA_ILLEGAL		XCHAL_CA_ILLEGAL
-# else
-/*  Hardcode these, because they get remapped when caches or writeback not configured:  */
-#  define CA_BYPASS		2
-#  define CA_WRITETHRU		1
-#  define CA_WRITEBACK		4
-#  define CA_WRITEBACK_NOALLOC	5
-#  define CA_ILLEGAL		15
-# endif
-
-/* internal function that returns 1 if the supplied attr indicates the
- * cache is in writeback mode.
- */
-static inline int is_writeback(unsigned attr) {
-#if XCHAL_HAVE_XLT_CACHEATTR
-	return attr == CA_WRITEBACK || attr == CA_WRITEBACK_NOALLOC;
-#endif
-#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY
-	return (attr | 0x3) == CA_WRITEBACK;
-#endif
-	return -1; /* unsupported */
-}
-
-/*
- *  xthal_set_region_translation()
- *
- *  Establishes a new mapping (with the supplied cache attributes)
- *  between a virtual address region, and a physical address region.
- *
- *  This function is only supported with following processor configurations:
- *  				a) Region Translation
- *  				b) v3 MMU with a spanning way running in the default mode
- *
- *  If the specified memory range exactly covers a series
- *  of consecutive 512 MB regions, the address mapping and cache
- *  attributes of these regions are updated.
- *
- *  If this is not the case, e.g. if either or both the
- *  start and end of the range only partially cover a 512 MB
- *  region, one of three results are possible:
- *
- *	1.  By default, the cache attribute of all regions
- *	    covered, even just partially, is changed to
- *	    the requested attribute.
- *
- *	2.  If the XTHAL_CAFLAG_EXACT flag is specified,
- *	    a non-zero error code is returned.
- *
- *	3.  If the XTHAL_CAFLAG_NO_PARTIAL flag is specified
- *	    (but not the EXACT flag), only regions fully
- *	    covered by the specified range are updated with
- *	    the requested attribute.
- *
- *  CACHE HANDLING
- *
- *  This function automatically writes back dirty data before remapping a
- *  virtual address region.
- *
- *  This writeback is done safely, ie. by first switching to writethrough
- *  mode, and then invoking xthal_dcache_all_writeback(). Such a sequence is
- *  necessary to ensure there is no longer any dirty data in the memory region by the time
- *  this function returns, even in the presence of interrupts, speculation, etc.
- *  This automatic write-back can be disabled using the XTHAL_CAFLAG_NO_AUTO_WB flag.
- *
- *	This function also invalidates the caches after remapping a region because the
- *	cache could contain (now invalid) data from the previous mapping.
- *  This automatic invalidate can be disabled using the XTHAL_CAFLAG_NO_AUTO_INV flag.
- *
- *  Parameters:
- *	vaddr	starting virtual address of region of memory
- *
- *	paddr	starting physical address for the mapping (this should be 512MB aligned to vaddr such that ((vaddr ^ paddr) & 0x10000000 == 0)
- *
- *	size	number of bytes in region of memory
- *		(see above, SPECIFYING THE MEMORY REGION)
- *
- *	cattr	cache attribute (encoded);
- *		typically taken from compile-time HAL constants
- *		XCHAL_CA_{BYPASS, WRITETHRU, WRITEBACK[_NOALLOC], ILLEGAL}
- *		(defined in <xtensa/config/core.h>);
- *		in XEA1, this corresponds to the value of a nibble
- *		in the CACHEATTR register;
- *		in XEA2, this corresponds to the value of the
- *		cache attribute (CA) field of each TLB entry
- *
- *	flags	bitwise combination of flags XTHAL_CAFLAG_*
- *
- *			XTHAL_CAFLAG_EXACT - If this flag is present,
- *			the mapping will only be done if the specified
- *			region exactly matches on or more 512MB pages otherwise
- *			XCHAL_INEXACT is returned (and no mapping is done).
- *
- *			XTHAL_CAFLAG_NO_PARTIAL - If this flag is specified, then
- *			only pages that are completely covered by the specified region
- *			are affected.  If this flag is specified, and no pages are completely
- *			covered by the region, then no pages are affected and XCHAL_NO_REGIONS_COVERED
- *			is returned.
- *
- *
- *
- *  Returns:
- *	XCHAL_SUCCESS 	-			successful, or size is zero
- *
- *	XCHAL_NO_REGIONS_COVERED	- 	XTHAL_CAFLAG_NO_PARTIAL flag specified and address range
- *								is valid with a non-zero size, however no 512 MB region (or page)
- *								is completely covered by the range
- *
- *	XCHAL_INEXACT 				XTHAL_CAFLAG_EXACT flag specified, and address range does
- *								not exactly specify a 512 MB region (or page)
- *
- *	XCHAL_INVALID_ADDRESS		invalid address range specified (wraps around the end of memory)
- *
- *	XCHAL_ADDRESS_MISALIGNED	virtual and physical addresses are not aligned (512MB)
- *
- *
- *	XCHAL_UNSUPPORTED_ON_THIS_ARCH	function not supported in this processor configuration
- */
-int xthal_set_region_translation(void* vaddr, void* paddr, unsigned size,
-		unsigned cattr, unsigned flags) {
-#if XCHAL_HAVE_XEA2 & !XCHAL_HAVE_MPU
-#if XCHAL_HAVE_XLT_CACHEATTR || (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
-	const unsigned CA_MASK = 0xF;
-	const unsigned addr_mask = 0x1fffffff;
-	const unsigned addr_shift = 29;
-	unsigned vaddr_a = (unsigned) vaddr;
-	unsigned paddr_a = (unsigned) paddr;
-	unsigned end_vaddr;
-	unsigned end_paddr;
-	unsigned start_va_reg;
-	unsigned end_va_reg;
-	unsigned start_pa_reg;
-	unsigned icache_attr = 0;
-	int rv;
-	int i;
-	if (size == 0)
-		return XTHAL_SUCCESS;
-	if ((vaddr_a & addr_mask) ^ (paddr_a & addr_mask))
-		return XTHAL_ADDRESS_MISALIGNED;
-	icache_attr = cattr & CA_MASK;
-#if (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
-	// if using the mmu in spanning way mode then 'and in' the R, RX, RW, RWX bits
-	if ((cattr & 0x40000000) && (icache_attr < 12))
-		icache_attr = icache_attr & ((cattr & 0xF0) >> 4);
-#endif
-	end_vaddr = vaddr_a + size - 1;
-	end_paddr = paddr_a + size - 1;
-
-	if ((end_vaddr < vaddr_a) || (end_paddr < paddr_a))
-		return XTHAL_INVALID_ADDRESS;
-	start_va_reg = vaddr_a >> addr_shift;
-	end_va_reg = end_vaddr >> addr_shift;
-	start_pa_reg = paddr_a >> addr_shift;
-	if ((flags & XTHAL_CAFLAG_EXACT)
-			&& ((size & addr_mask) || (vaddr_a & addr_mask)
-					|| (paddr_a & addr_mask)))
-		return XTHAL_INEXACT;
-	if (flags & XTHAL_CAFLAG_NO_PARTIAL) {
-		if (vaddr_a & addr_mask) {
-			start_va_reg++;
-			start_pa_reg++;
-		}
-		if ((end_vaddr & addr_mask) != addr_mask)
-			end_va_reg--;
-	}
-	if (end_va_reg < start_va_reg)
-		return XTHAL_NO_REGIONS_COVERED;
-	/*
-	 * Now we need to take care of any uncommitted cache writes in the affected regions
-	 * 1) first determine if any regions are in write back mode
-	 * 2) change those pages to write through
-	 * 3) force the writeback of d-cache by calling xthal_dcach_all_writeback()
-	 */
-#if ((XCHAL_DCACHE_SIZE >0) && XCHAL_DCACHE_IS_WRITEBACK)
-	if (!(flags & XTHAL_CAFLAG_NO_AUTO_WB)) {
-		unsigned old_cache_attr = xthal_get_cacheattr();
-		unsigned cachewrtr = old_cache_attr;
-		unsigned need_safe_writeback = 0;
-		for (i = start_va_reg; i <= end_va_reg; i++) {
-			unsigned sh = i << 2;
-			unsigned old_attr = (old_cache_attr >> sh) & CA_MASK;
-			if (is_writeback(old_attr)) {
-				need_safe_writeback = 1;
-				cachewrtr = (cachewrtr & ~(CA_MASK << sh))
-						| (CA_WRITETHRU << sh);
-			}
-		}
-
-		if (need_safe_writeback) {
-			xthal_set_cacheattr(cachewrtr); /* set to writethru first, to safely writeback any dirty data */
-			xthal_dcache_all_writeback(); /* much quicker than scanning entire 512MB region(s) */
-		}
-	}
-#endif
-	/* Now we set the affected region translations */
-	for (i = start_va_reg; i <= end_va_reg; i++) {
-		if ((rv = xthal_set_region_translation_raw(
-				(void*) ((start_va_reg++) << addr_shift),
-				(void*) ((start_pa_reg++) << addr_shift), icache_attr)))
-			return rv;
-	}
-
-	/*
-	 * Now we need to invalidate the cache in the affected regions. For now invalidate entire cache,
-	 * but investigate if there are faster alternatives on some architectures.
-	 */
-	if (!(flags & XTHAL_CAFLAG_NO_AUTO_INV)) {
-# if XCHAL_DCACHE_SIZE > 0
-		xthal_dcache_all_writeback_inv(); /* some areas in memory (outside the intended region) may have uncommitted
-		 data so we need the writeback_inv(). */
-#endif
-#if	XCHAL_ICACHE_SIZE >0
-		xthal_icache_all_invalidate();
-#endif
-	}
-	return XTHAL_SUCCESS;
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-}
-
-/* xthal_invalidate_region()
- * invalidates the tlb entry for the specified region.
- *
- * This function is only supported on processor configurations 
- * with a v3 MMU with a spanning way.
- *
- * Parameter
- * vaddr - virtual address of region to invalidate (512MB aligned)
- *
- * returns:
- * XCHAL_SUCCESS 					- Success
- * XCHAL_UNSUPPORTED_ON_THIS_ARCH 			- Unsupported
- *
- */
-int xthal_invalidate_region(void* vaddr) {
-#if XCHAL_HAVE_XEA2 & !XCHAL_HAVE_MPU
-#if (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
-	unsigned addr = (unsigned) vaddr;
-	if (addr & 0x1fffffff)
-		return XTHAL_INVALID_ADDRESS;
-	addr += XCHAL_SPANNING_WAY;
-	invalidate_dtlb_entry(addr);
-	invalidate_itlb_entry(addr);
-	return XTHAL_SUCCESS;
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-#else
-	return XTHAL_UNSUPPORTED;
-#endif
-}
-
+/*
+ * Copyright (c) 2004-2014 Tensilica Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <xtensa/config/core.h>
+
+#if XCHAL_HAVE_XEA2 && (!XCHAL_HAVE_MPU)
+/*
+ * C-stubs to issue the tlb related instructions (with dsync and isync's if needed).
+ *
+ */
+static inline void write_dtlb_entry(unsigned vpn_way, unsigned ppn_ca) {
+	__asm__ __volatile__("wdtlb  %1, %0; dsync\n\t"
+			: : "r" (vpn_way), "r" (ppn_ca) );
+}
+
+static inline void write_itlb_entry(unsigned vpn_way, unsigned ppn_ca) {
+	__asm__ __volatile__("witlb  %1, %0; isync\n\t"
+			: : "r" (vpn_way), "r" (ppn_ca) );
+}
+
+static inline unsigned read_dtlb1_entry(unsigned addr) {
+	unsigned long tmp;
+	__asm__ __volatile__("rdtlb1  %0, %1\n\t"
+			: "=a" (tmp)
+			: "a" (addr));
+	return tmp;
+}
+
+static inline unsigned read_itlb1_entry(unsigned addr) {
+	unsigned long tmp;
+	__asm__ __volatile__("ritlb1  %0, %1\n\t"
+			: "=a" (tmp)
+			: "a" (addr));
+	return tmp;
+}
+
+static inline unsigned probe_dtlb(unsigned addr) {
+	unsigned long tmp;
+	__asm__ __volatile__("pdtlb  %0, %1\n\t"
+			: "=a" (tmp)
+			: "a" (addr));
+	return tmp;
+}
+
+static inline unsigned probe_itlb(unsigned addr) {
+	unsigned long tmp;
+	__asm__ __volatile__("pitlb  %0, %1\n\t"
+			: "=a" (tmp)
+			: "a" (addr));
+	return tmp;
+}
+
+static inline void invalidate_dtlb_entry(unsigned addr) {
+	__asm__ __volatile__("idtlb  %0; dsync \n\t"
+			: : "a" (addr));
+}
+
+static inline void invalidate_itlb_entry(unsigned addr) {
+	__asm__ __volatile__("iitlb  %0 ; isync\n\t"
+			: : "a" (addr));
+}
+
+static inline unsigned read_dtlbcfg() {
+	unsigned long tmp;
+	__asm__ __volatile__("rsr.dtlbcfg %0\n\t"
+			: "=a" (tmp));
+	return tmp;
+}
+
+static inline unsigned read_itlbcfg() {
+	unsigned long tmp;
+	__asm__ __volatile__("rsr.itlbcfg %0\n\t"
+			: "=a" (tmp));
+	return tmp;
+}
+
+#endif
+
+/*
+ *  xthal_set_region_translation_raw is a quick and simple function
+ *  to set both physical address <paddr> and cache attribute <cattr> for
+ *  a 512MB region at <vaddr>.
+ *
+ *  Parameters:
+ *  void* vaddr		512MB aligned pointer representing the start of virtual address region
+ *  void* paddr		512MB aligned pointer representing the start of physical address region
+ *  unsigned cattr	4 bit value encoding the caching properties and rights (MMU only).
+ *
+ *  returns 0 (XCHAL_SUCCESS) if successful
+ *  returns non zero (XCHAL_UNSUPPORTED) on failure
+ *
+ *  This function has the following limitations:
+ *
+ *  1) Requires either the Region Translation Option or a v3 MMU running in the default mode (with spanning way)
+ *  2) It does no error checking.
+ *  3) Deals with one 512MB region (vaddr and paddr are required to be 512MB aligned although that is not explicitly checked)
+ *  4) It requires the caller to do any cache flushing that is needed
+ *  5) Doesn't support mnemonically setting the 'rights' (rwx, rw, ... ) bit on the MMU
+ *  6) It is illegal to change the mapping of the region containing the current PC (not checked)
+ *
+ */
+int xthal_set_region_translation_raw(void *vaddr, void *paddr, unsigned cattr) {
+#if XCHAL_HAVE_MPU
+	return XTHAL_UNSUPPORTED;
+#else
+#if XCHAL_HAVE_XEA2
+#if XCHAL_HAVE_XLT_CACHEATTR || (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
+# if XCHAL_HAVE_XLT_CACHEATTR
+	unsigned vpn_way = (unsigned)vaddr;
+# else
+	unsigned vpn_way = ((unsigned) vaddr & 0xFFFFFFF0) + XCHAL_SPANNING_WAY;
+# endif
+	unsigned ppn_ca = ((unsigned) paddr & 0xFFFFFFF0) + (cattr & 0xF);
+	write_dtlb_entry(vpn_way, ppn_ca);
+	write_itlb_entry(vpn_way, ppn_ca);
+	return XTHAL_SUCCESS;
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+#endif
+}
+
+/*
+ * xthal_v2p() takes a virtual address as input, and if that virtual address is mapped to a physical address
+ * by the MMU, it returns the:
+ * 		a) corresponding physical address
+ * 		b) the tlb way that is used to translate the address
+ * 		c) cache attribute for translation
+ *
+ * 	Parameters:
+ * 	void* 		vaddr		A pointer representing the virtual address (there are no alignment requirements for this address)
+ * 	void**		paddr		This value can be 0, or can point to a pointer variable which will be updated to contain the physical address
+ * 	unsigned*	way			This value can be 0, or can point to an unsigned variable which will be updated to contain the TLB way.
+ * 	unsigned*   cattr		This value can be 0, or can point to an unsigned variable which will be updated to contain the cache attr
+ * 	                        For MPU configurations bits 0..3 hold the access rights and bits 4..8 hold the encoded memory type
+ *
+ *  Returns 	0 (XCHAL_SUCCESS) 				if successful
+ * 				XTHAL_NO_MAPPING				if there is no current mapping for the virtual address
+ * 				XCHAL_UNSUPPORTED            	if unsupported
+ *
+ * 	Limitations:
+ * 					Assumes that architecture variable DVARWAY56 is "Variable"
+ * 					Uses the D-TLBS for the translation ... assumption is that ITLB's have same mappings
+ */
+int xthal_v2p(void* vaddr, void** paddr, unsigned *way, unsigned* cattr) {
+#if XCHAL_HAVE_XEA2
+#if XCHAL_HAVE_MPU
+  if (paddr)
+    *paddr = vaddr;
+  if (way)
+    *way = 0;
+  if (cattr)
+  {
+      struct xthal_MPU_entry x = xthal_get_entry_for_address(vaddr, 0);
+      *cattr = XTHAL_MPU_ENTRY_GET_ACCESS(x) | XTHAL_MPU_ENTRY_GET_MEMORY_TYPE(x) << XTHAL_AR_WIDTH;
+  }
+  return XTHAL_SUCCESS;
+#else
+	unsigned long probe = probe_dtlb((unsigned) vaddr);
+#if !XCHAL_HAVE_PTP_MMU
+	if (!(0x1 & probe))
+	return XTHAL_NO_MAPPING;
+	if (way)
+	*way = 1;
+	if (paddr || cattr) {
+		unsigned long temp;
+		temp = read_dtlb1_entry(probe);
+		unsigned ppn = 0xe0000000 & temp;
+		unsigned att = 0xf & temp;
+		if (paddr)
+		*paddr = ((void*) (ppn + (((unsigned) vaddr) & 0x1fffffff)));
+		if (cattr)
+		*cattr = att;
+	}
+#else
+	{
+		unsigned iway;
+		if (!(0x10 & probe))
+			return XTHAL_NO_MAPPING;
+		iway = 0xf & probe;
+		if (way)
+			*way = iway;
+		if (paddr || cattr) {
+			unsigned temp;
+			unsigned ppn;
+			unsigned ppn1;
+			unsigned dtlbcfg = read_dtlbcfg();
+			temp = read_dtlb1_entry(probe);
+			unsigned att = 0xf & temp;
+			if (cattr)
+				*cattr = att;
+			if (paddr)
+				switch (iway) // followin code derived from fig 4-40 from ISA MMU Option Data (at) Format for RxTLB1
+				{ /* 4k pages */
+				case 0:
+				case 1:
+				case 2:
+				case 3:
+				case 7:
+				case 8:
+				case 9:
+					ppn = 0xfffff000; // 4k pages
+					break;
+				case 4: {
+					switch ((dtlbcfg & (0x3 << 16)) >> 16) // bits 16 & 17
+					{
+					case 0: // 1MB pages
+						ppn = 0xfff00000;
+						break;
+					case 1: // 4MB pages
+						ppn = 0xffc00000;
+						break;
+					case 2: // 16MB pages
+						ppn = 0xff000000;
+						break;
+					case 3: // 64MB pages
+						ppn = 0xfc000000;
+						break;
+					default:
+						return XTHAL_UNSUPPORTED;
+					}
+				}
+					break;
+				case 5:
+					if ((dtlbcfg & (1 << 20)))
+						ppn = 0xf8000000; // 128MB pages
+					else
+						ppn = 0xf0000000; // 256MB pages
+					break;
+				case 6:
+					if ((dtlbcfg & (1 << 24)))
+						ppn = 0xe0000000; // 512MB pages
+					else
+						ppn = 0xf0000000; // 256MB pages
+					break;
+				default:
+					return XTHAL_UNSUPPORTED;
+					break;
+				}
+			ppn1 = ppn & temp;
+			*paddr = ((void*) (ppn1 + (((unsigned) vaddr) & (~ppn))));
+		}
+	}
+#endif
+	return XTHAL_SUCCESS;
+#endif
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+}
+
+/* these constants borrowed from xthal_set_region_attribute */
+# if XCHAL_HAVE_PTP_MMU
+#  define CA_BYPASS		XCHAL_CA_BYPASS
+#  define CA_WRITETHRU		XCHAL_CA_WRITETHRU
+#  define CA_WRITEBACK		XCHAL_CA_WRITEBACK
+#  define CA_WRITEBACK_NOALLOC	XCHAL_CA_WRITEBACK_NOALLOC
+#  define CA_ILLEGAL		XCHAL_CA_ILLEGAL
+# else
+/*  Hardcode these, because they get remapped when caches or writeback not configured:  */
+#  define CA_BYPASS		2
+#  define CA_WRITETHRU		1
+#  define CA_WRITEBACK		4
+#  define CA_WRITEBACK_NOALLOC	5
+#  define CA_ILLEGAL		15
+# endif
+
+/* internal function that returns 1 if the supplied attr indicates the
+ * cache is in writeback mode.
+ */
+static inline int is_writeback(unsigned attr) {
+#if XCHAL_HAVE_XLT_CACHEATTR
+	return attr == CA_WRITEBACK || attr == CA_WRITEBACK_NOALLOC;
+#endif
+#if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY
+	return (attr | 0x3) == CA_WRITEBACK;
+#endif
+	return -1; /* unsupported */
+}
+
+/*
+ *  xthal_set_region_translation()
+ *
+ *  Establishes a new mapping (with the supplied cache attributes)
+ *  between a virtual address region, and a physical address region.
+ *
+ *  This function is only supported with following processor configurations:
+ *  				a) Region Translation
+ *  				b) v3 MMU with a spanning way running in the default mode
+ *
+ *  If the specified memory range exactly covers a series
+ *  of consecutive 512 MB regions, the address mapping and cache
+ *  attributes of these regions are updated.
+ *
+ *  If this is not the case, e.g. if either or both the
+ *  start and end of the range only partially cover a 512 MB
+ *  region, one of three results are possible:
+ *
+ *	1.  By default, the cache attribute of all regions
+ *	    covered, even just partially, is changed to
+ *	    the requested attribute.
+ *
+ *	2.  If the XTHAL_CAFLAG_EXACT flag is specified,
+ *	    a non-zero error code is returned.
+ *
+ *	3.  If the XTHAL_CAFLAG_NO_PARTIAL flag is specified
+ *	    (but not the EXACT flag), only regions fully
+ *	    covered by the specified range are updated with
+ *	    the requested attribute.
+ *
+ *  CACHE HANDLING
+ *
+ *  This function automatically writes back dirty data before remapping a
+ *  virtual address region.
+ *
+ *  This writeback is done safely, ie. by first switching to writethrough
+ *  mode, and then invoking xthal_dcache_all_writeback(). Such a sequence is
+ *  necessary to ensure there is no longer any dirty data in the memory region by the time
+ *  this function returns, even in the presence of interrupts, speculation, etc.
+ *  This automatic write-back can be disabled using the XTHAL_CAFLAG_NO_AUTO_WB flag.
+ *
+ *	This function also invalidates the caches after remapping a region because the
+ *	cache could contain (now invalid) data from the previous mapping.
+ *  This automatic invalidate can be disabled using the XTHAL_CAFLAG_NO_AUTO_INV flag.
+ *
+ *  Parameters:
+ *	vaddr	starting virtual address of region of memory
+ *
+ *	paddr	starting physical address for the mapping (this should be 512MB aligned to vaddr such that ((vaddr ^ paddr) & 0x10000000 == 0)
+ *
+ *	size	number of bytes in region of memory
+ *		(see above, SPECIFYING THE MEMORY REGION)
+ *
+ *	cattr	cache attribute (encoded);
+ *		typically taken from compile-time HAL constants
+ *		XCHAL_CA_{BYPASS, WRITETHRU, WRITEBACK[_NOALLOC], ILLEGAL}
+ *		(defined in <xtensa/config/core.h>);
+ *		in XEA1, this corresponds to the value of a nibble
+ *		in the CACHEATTR register;
+ *		in XEA2, this corresponds to the value of the
+ *		cache attribute (CA) field of each TLB entry
+ *
+ *	flags	bitwise combination of flags XTHAL_CAFLAG_*
+ *
+ *			XTHAL_CAFLAG_EXACT - If this flag is present,
+ *			the mapping will only be done if the specified
+ *			region exactly matches on or more 512MB pages otherwise
+ *			XCHAL_INEXACT is returned (and no mapping is done).
+ *
+ *			XTHAL_CAFLAG_NO_PARTIAL - If this flag is specified, then
+ *			only pages that are completely covered by the specified region
+ *			are affected.  If this flag is specified, and no pages are completely
+ *			covered by the region, then no pages are affected and XCHAL_NO_REGIONS_COVERED
+ *			is returned.
+ *
+ *
+ *
+ *  Returns:
+ *	XCHAL_SUCCESS 	-			successful, or size is zero
+ *
+ *	XCHAL_NO_REGIONS_COVERED	- 	XTHAL_CAFLAG_NO_PARTIAL flag specified and address range
+ *								is valid with a non-zero size, however no 512 MB region (or page)
+ *								is completely covered by the range
+ *
+ *	XCHAL_INEXACT 				XTHAL_CAFLAG_EXACT flag specified, and address range does
+ *								not exactly specify a 512 MB region (or page)
+ *
+ *	XCHAL_INVALID_ADDRESS		invalid address range specified (wraps around the end of memory)
+ *
+ *	XCHAL_ADDRESS_MISALIGNED	virtual and physical addresses are not aligned (512MB)
+ *
+ *
+ *	XCHAL_UNSUPPORTED_ON_THIS_ARCH	function not supported in this processor configuration
+ */
+int xthal_set_region_translation(void* vaddr, void* paddr, unsigned size,
+		unsigned cattr, unsigned flags) {
+#if XCHAL_HAVE_XEA2 & !XCHAL_HAVE_MPU
+#if XCHAL_HAVE_XLT_CACHEATTR || (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
+	const unsigned CA_MASK = 0xF;
+	const unsigned addr_mask = 0x1fffffff;
+	const unsigned addr_shift = 29;
+	unsigned vaddr_a = (unsigned) vaddr;
+	unsigned paddr_a = (unsigned) paddr;
+	unsigned end_vaddr;
+	unsigned end_paddr;
+	unsigned start_va_reg;
+	unsigned end_va_reg;
+	unsigned start_pa_reg;
+	unsigned icache_attr = 0;
+	int rv;
+	int i;
+	if (size == 0)
+		return XTHAL_SUCCESS;
+	if ((vaddr_a & addr_mask) ^ (paddr_a & addr_mask))
+		return XTHAL_ADDRESS_MISALIGNED;
+	icache_attr = cattr & CA_MASK;
+#if (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
+	// if using the mmu in spanning way mode then 'and in' the R, RX, RW, RWX bits
+	if ((cattr & 0x40000000) && (icache_attr < 12))
+		icache_attr = icache_attr & ((cattr & 0xF0) >> 4);
+#endif
+	end_vaddr = vaddr_a + size - 1;
+	end_paddr = paddr_a + size - 1;
+
+	if ((end_vaddr < vaddr_a) || (end_paddr < paddr_a))
+		return XTHAL_INVALID_ADDRESS;
+	start_va_reg = vaddr_a >> addr_shift;
+	end_va_reg = end_vaddr >> addr_shift;
+	start_pa_reg = paddr_a >> addr_shift;
+	if ((flags & XTHAL_CAFLAG_EXACT)
+			&& ((size & addr_mask) || (vaddr_a & addr_mask)
+					|| (paddr_a & addr_mask)))
+		return XTHAL_INEXACT;
+	if (flags & XTHAL_CAFLAG_NO_PARTIAL) {
+		if (vaddr_a & addr_mask) {
+			start_va_reg++;
+			start_pa_reg++;
+		}
+		if ((end_vaddr & addr_mask) != addr_mask)
+			end_va_reg--;
+	}
+	if (end_va_reg < start_va_reg)
+		return XTHAL_NO_REGIONS_COVERED;
+	/*
+	 * Now we need to take care of any uncommitted cache writes in the affected regions
+	 * 1) first determine if any regions are in write back mode
+	 * 2) change those pages to write through
+	 * 3) force the writeback of d-cache by calling xthal_dcach_all_writeback()
+	 */
+#if ((XCHAL_DCACHE_SIZE >0) && XCHAL_DCACHE_IS_WRITEBACK)
+	if (!(flags & XTHAL_CAFLAG_NO_AUTO_WB)) {
+		unsigned old_cache_attr = xthal_get_cacheattr();
+		unsigned cachewrtr = old_cache_attr;
+		unsigned need_safe_writeback = 0;
+		for (i = start_va_reg; i <= end_va_reg; i++) {
+			unsigned sh = i << 2;
+			unsigned old_attr = (old_cache_attr >> sh) & CA_MASK;
+			if (is_writeback(old_attr)) {
+				need_safe_writeback = 1;
+				cachewrtr = (cachewrtr & ~(CA_MASK << sh))
+						| (CA_WRITETHRU << sh);
+			}
+		}
+
+		if (need_safe_writeback) {
+			xthal_set_cacheattr(cachewrtr); /* set to writethru first, to safely writeback any dirty data */
+			xthal_dcache_all_writeback(); /* much quicker than scanning entire 512MB region(s) */
+		}
+	}
+#endif
+	/* Now we set the affected region translations */
+	for (i = start_va_reg; i <= end_va_reg; i++) {
+		if ((rv = xthal_set_region_translation_raw(
+				(void*) ((start_va_reg++) << addr_shift),
+				(void*) ((start_pa_reg++) << addr_shift), icache_attr)))
+			return rv;
+	}
+
+	/*
+	 * Now we need to invalidate the cache in the affected regions. For now invalidate entire cache,
+	 * but investigate if there are faster alternatives on some architectures.
+	 */
+	if (!(flags & XTHAL_CAFLAG_NO_AUTO_INV)) {
+# if XCHAL_DCACHE_SIZE > 0
+		xthal_dcache_all_writeback_inv(); /* some areas in memory (outside the intended region) may have uncommitted
+		 data so we need the writeback_inv(). */
+#endif
+#if	XCHAL_ICACHE_SIZE >0
+		xthal_icache_all_invalidate();
+#endif
+	}
+	return XTHAL_SUCCESS;
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+}
+
+/* xthal_invalidate_region()
+ * invalidates the tlb entry for the specified region.
+ *
+ * This function is only supported on processor configurations 
+ * with a v3 MMU with a spanning way.
+ *
+ * Parameter
+ * vaddr - virtual address of region to invalidate (512MB aligned)
+ *
+ * returns:
+ * XCHAL_SUCCESS 					- Success
+ * XCHAL_UNSUPPORTED_ON_THIS_ARCH 			- Unsupported
+ *
+ */
+int xthal_invalidate_region(void* vaddr) {
+#if XCHAL_HAVE_XEA2 & !XCHAL_HAVE_MPU
+#if (XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY)
+	unsigned addr = (unsigned) vaddr;
+	if (addr & 0x1fffffff)
+		return XTHAL_INVALID_ADDRESS;
+	addr += XCHAL_SPANNING_WAY;
+	invalidate_dtlb_entry(addr);
+	invalidate_itlb_entry(addr);
+	return XTHAL_SUCCESS;
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+#else
+	return XTHAL_UNSUPPORTED;
+#endif
+}
+
diff --git a/src/arch/xtensa/include/xtensa/c6x-compat.h b/src/arch/xtensa/include/xtensa/c6x-compat.h
index 4b17987ea..ca91bd718 100755
--- a/src/arch/xtensa/include/xtensa/c6x-compat.h
+++ b/src/arch/xtensa/include/xtensa/c6x-compat.h
@@ -1,1758 +1,1758 @@
-/*
- * Copyright (c) 2006-2010 Tensilica Inc. ALL RIGHTS RESERVED.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef __C6X_COMPAT__H
-#define __C6X_COMPAT__H
-
-/* Unimplemented functions _gmpy, _gmpy4, _xormpy, _lssub, _cmpy, _cmpyr,
-   _cmpyr1, _ddotpl2r, _ddotph2r */
-
-
-typedef long long C6X_COMPAT_LONG40;
-
-
-#define _memd8(a) (*((double*)(a)))
-#define _memd8_const(a) (*((const double*)(a)))
-
-#define _amemd8(a) (*((double*)(a)))
-#define _amemd8_const(a) (*((const double*)(a)))
-
-#define _mem8(a) (*((unsigned long long*)(a)))
-#define _mem8_const(a) (*((const unsigned long long*)(a)))
-
-#define _mem4(a) (*((unsigned*)(a)))
-#define _mem4_const(a) (*((const unsigned*)(a)))
-#define _amem4_const(a) (*((const unsigned*)(a)))
-
-/*  NOTE: To emulate a C6X properly you should define global variables
-    for your Xtensa with these names.  Some of the emulation routines
-    will set these values.  */
-
-extern int _carry;
-extern int _overflow;
-
-//   Utility routines
-
-
-#define TESTBIT(x,n) (((x) >> (n)) & 1)
-
-#define NSA_BITS 32
-
-static inline unsigned int norm_shift_amt_U_and_non_U(int is_signed, int inp) {
-int j=0, k=0;
-int x=inp;
-if (is_signed) {
-    /* Invert signed val if negative */
-    x= TESTBIT(x,(NSA_BITS-1))? ~x: x;
-    x= (x&1)|(x<<1);		/* Shift up to return count-1 */
-    if (x ==0) 
-      return NSA_BITS-1;
-  }
-  if (x ==0) 
-    return NSA_BITS;
-  /* Now count leading zeros */
-  for (j=0, k=NSA_BITS-1; k>=0; j++, k--) {
-    if (TESTBIT(x,k)) 
-      return j;
-  }
-  return NSA_BITS;
-}
-
-
-
-static inline long long
-orig_L40_set( long long L40_var1) {
-   long long L40_var_out;
-   
-   L40_var_out =  L40_var1 & 0x000000ffffffffffLL;
-
-   if( L40_var1 & 0x8000000000LL)
-      L40_var_out = L40_var_out | 0xffffff0000000000LL;
-
-   return( L40_var_out);
-}
-
-
-
-static inline signed long long
-util_saturate_n_no_state(signed long long t, int n)
-{
-  signed long long maxv, minv;
-  maxv = (1LL << (n-1)) - 1;
-  minv = (-1LL << (n-1));
-  if (t > maxv) {
-    t = maxv;
-  } else if (t < minv) {
-    t = minv;
-  }
-  return t;
-}
-
-
-static inline signed long long
-util_saturate_n_sgn(signed long long t, int n)
-{
-  signed long long result;
-  signed long long maxv, minv;
-  maxv = (1LL << (n-1)) - 1;
-  minv = (-1LL << (n-1));
-  if (t > 0) {
-    result = maxv;
-    _overflow = 1;
-  } else if (t < 0) {
-    result = minv;
-    _overflow = 1;
-  } else {
-    result = 0;
-  }
-  return result;
-}
-
-
-
-
-/* well-behaved signed shift right (left on negative) with
-   saturation */
-static inline signed long long
-util_shift_right_saturate_n(signed long long t, int shval, int n)
-{
-  /* n should be <= 62 */
-  long long result;
-
-  signed long long mask;
-  int actual_shift = shval;
-  long long shft = actual_shift > 0 ? actual_shift : -actual_shift;
-
-  if (t == 0 || actual_shift == 0)
-    return t;
-
-  if (actual_shift >= n) {
-    return (t < 0) ? -1 : 0;
-  }
-  if (actual_shift <= -n) {
-    return util_saturate_n_sgn(t, n);
-  }
-  if (actual_shift > 0) {
-    return t >> actual_shift;
-  }
-  /* actual_shift < 0. Check for saturation after shift. */
-  mask = (-1LL << (n-shft-1));
-  if (t > 0 && ((mask & t) != 0)) {
-    return util_saturate_n_sgn(t, n);
-  }
-  if (t < 0 && ((mask & t) != mask)) {
-    return util_saturate_n_sgn(t, n);
-  }
-  result = t << shft;
-
-  return result;
-}
-
-
-/* Implemented c6x standard C compatibility functions (alphabetical
-   order) */
-
-
-static inline int _abs(int src1) {
-	if ((unsigned) src1 == (unsigned) 0x80000000) {
-		return 0x7fffffff;
-	}
-	return abs(src1);
-}
-
-
-static inline int _abs2(int src1) {
-	short s1[2],r[2];
-	int result;
-	*((int*)s1) = src1;
-	if ((unsigned short) s1[1] == (unsigned short) 0x8000) r[1] = 0x7fff;
-	   else r[1] = abs(s1[1]);
-	if ((unsigned short) s1[0] == (unsigned short) 0x8000) r[0] = 0x7fff;
-	   else r[0] = abs(s1[0]);
-	result = *(int*)r;
-	return result;
-	}
-
-
-
-
-static inline int _add2(int src1, int src2) {
-	short s1[2], s2[2], r[2];
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r[0] = s1[0] + s2[0];
-	r[1] = s1[1] + s2[1];
-	result = *(int*)r;
-	return result;
-}
-
-static inline int _add4(int src1, int src2) {
-	char c1[4], c2[4], r[4];
-	int result;
-	*((int*)c1) = src1;
-	*((int*)c2) = src2;
-	r[0] = c1[0] + c2[0];
-	r[1] = c1[1] + c2[1];
-	r[2] = c1[2] + c2[2];
-	r[3] = c1[3] + c2[3];
-	result = *(int*)r;
-	return result;
-}
-
-
-
-static inline long long _addsub(unsigned int src1, unsigned int src2)
-{
-  
-  int res_lo;
-  int res_hi;
- 
-  res_hi = src1+src2;
-  res_lo = src1-src2;
-  return (((unsigned long long) res_hi) << 32) | ((unsigned int) res_lo) ;
-}
-
-  
-static inline long long _addsub2(unsigned int src1, unsigned int src2)
-{
-  short s1[2], s2[2], ra[2], rs[2];
-  int res_lo;
-  int res_hi;
-
-  *((int*)s1) = src1;
-  *((int*)s2) = src2;
-  ra[0] = s1[0] + s2[0];
-  ra[1] = s1[1] + s2[1];
-  rs[0] = s1[0] - s2[0];
-  rs[1] = s1[1] - s2[1];
-  
-  res_hi = *(int*)ra;
-  res_lo = *(int*)rs;
-  return (((unsigned long long) res_hi) << 32) | ((unsigned int) res_lo) ;
-}
-
-
-static inline int _avg2(int src1, int src2) {
-  int low = (((int)1 +  (short) src1 + (short) src2) >> 1) & 0XFFFF;
-  int high1 = src1 >> 16;
-  int high2 = src2 >> 16;
-  int high = ((high1 + high2 + 1) >> 1)<< 16;
-  return high | low;
-}
-
-
-
-static inline unsigned int _avgu4(unsigned int src1, unsigned int src2) {
-unsigned int res0 = ((src1 & 0xFF) + (src2 & 0xFF) + 1) >> 1;
-  unsigned int res1 = (((src1 & 0xFF00) >> 8) + ((src2 & 0xFF00) >> 8) + 1) >> 1;
-  unsigned int res2 = (((src1 & 0xFF0000) >> 16) + ((src2 & 0xFF0000) >> 16) + 1) >> 1;
-  unsigned int res3 = (((src1 & 0xFF000000) >> 24) + ((src2 & 0xFF000000) >> 24) + 1) >> 1;
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-}
-
-
-static inline int TEN_popc (unsigned char b)
-{
-  int i, result = 0;
-  for (i = 0; i <  8; i++){
-    if (b & 0x1)
-      result++;
-    b >>= 1;
-  }
-  return result;
-}
-
-static inline unsigned int _bitc4(unsigned int src1)
-{
-  unsigned int res0 = TEN_popc(src1 & 0xFF);
-  unsigned int res1 = TEN_popc((src1 & 0xFF00) >> 8);
-  unsigned int res2 = TEN_popc((src1 & 0xFF0000) >> 16);
-  unsigned int res3 = TEN_popc((src1 & 0xFF000000) >> 24);
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-}
-
-static inline unsigned int _bitr(unsigned int src) {
-	int i;
-	unsigned r = 0;
-	for (i = 0; i< 32; ++i) {
-		r = r | (((src >> i) & 1)<<(31-i));
-	}
-	return r;
-}
-
-
-static inline unsigned int _clr(unsigned int src2,  int csta,  int cstb)
-{
-  csta &= 0x1f;
-  cstb &= 0x1f;
-  if (csta > cstb)
-    return src2;
-  else {
-    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
-    return src2 & (~mask);
-  }
-}
-
-static inline unsigned int _clrr(unsigned int src2, int src1)
-{
-  unsigned int csta = (src1 >> 5) & 0x1f;
-  unsigned int cstb = src1 & 0x1f;
-  if (csta > cstb)
-    return src2;
-  else {
-    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
-    return src2 & (~mask);
-  }
-}
-
-
-
-
-static inline int _cmpeq2(int src1, int src2) {
-	short s1[2], s2[2];
-	int r0, r1;
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r0 = s1[0] == s2[0] ? 1 : 0;
-	r1 = s1[1] == s2[1] ? 1 : 0;
-	result = (r1 << 1) | r0;
-	return result;
-}
-
-static inline int _cmpeq4(int src1, int src2) {
-	char s1[4], s2[4];
-	int r0, r1, r2, r3;
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r0 = s1[0] == s2[0] ? 1 : 0;
-	r1 = s1[1] == s2[1] ? 1 : 0;
-	r2 = s1[2] == s2[2] ? 1 : 0;
-	r3 = s1[3] == s2[3] ? 1 : 0;
-	result = (r3 << 3) | (r2 << 2) | (r1 << 1) | r0;
-	return result;
-}
-
-
-static inline int _cmpgt2(int src1, int src2) {
-	short s1[2], s2[2];
-	int r1, r0;
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r0 = s1[0] > s2[0] ? 1 : 0;
-	r1 = s1[1] > s2[1] ? 1 : 0;
-	result = (r1<<1) | r0;
-	return result;
-}
-
-
-static inline unsigned int _cmpgtu4(unsigned int src1, unsigned int src2) {
-  unsigned int s1_0 = (src1 & 0xFF);
-  unsigned int s1_1 = (src1 & 0xFF00) >> 8;
-  unsigned int s1_2 = (src1 & 0xFF0000) >> 16;
-  unsigned int s1_3 = (src1 & 0xFF000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xFF);
-  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
-  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
-
-  unsigned int result = 0;
-
-  if (s1_0 > s2_0)
-    result |= 0x1;
-
-  if (s1_1 > s2_1)
-    result |= 0x2;
-
-  if (s1_2 > s2_2)
-    result |= 0x4;
-
-  if (s1_3 > s2_3)
-    result |= 0x8;
-
-  return result;
-}
-
-
-
-
-static inline long long _ddotp4(unsigned int src1, unsigned int src2) {
-  unsigned int res0, res1;
-  short s1_0 = (src1 & 0xffff);
-  short s1_1 = (src1 & 0xfff0000) >> 16;
-
-  unsigned short s2_0 = (src2 & 0xff);
-  unsigned short s2_1 = (src2 & 0xff00) >> 8;
-  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
-
-  res0 = ((int)s1_0) * s2_0 + ((int)s1_1) * s2_1;
-  res1 = ((int)s1_0) * s2_2 + ((int)s1_1) * s2_3;
-
-  return (res1 << 16) | res0;
-}
-
-
-static inline long long _ddotph2(long long src1_o_src1_e, unsigned int src2)
-{
-
-  unsigned int src1_o = src1_o_src1_e >> 32;  
-  unsigned int src1_e = src1_o_src1_e & 0xFFFFFFFF;  
-  short ls1_o = src1_o & 0XFFFF;
-  short hs1_o = src1_o >> 16;
-//  short ls1_e = src1_e & 0XFFFF;
-  short hs1_e = src1_e >> 16;
-  short ls2 = src2 & 0XFFFF;
-  short hs2 = src2 >> 16;
-
-  unsigned long long res_hi = ls2 * ls1_o + hs2 * hs1_o;
-  unsigned int res_lo = ls1_o * hs2 + hs1_e * ls2;
-  return (res_hi << 32) | res_lo;
-}
-
-
-static inline long long _ddotpl2(long long src1_o_src1_e, unsigned int src2)
-{
-  unsigned int src1_o = src1_o_src1_e >> 32;  
-  unsigned int src1_e = src1_o_src1_e & 0xFFFFFFFF;  
-  short ls1_o = src1_o & 0XFFFF;
-//  short hs1_o = src1_o >> 16;
-  short ls1_e = src1_e & 0XFFFF;
-  short hs1_e = src1_e >> 16;
-  short ls2 = src2 & 0XFFFF;
-  short hs2 = src2 >> 16;
-
-  unsigned long long res_hi = ls2 * hs1_e + hs2 * ls1_o;
-  unsigned res_lo = hs1_e * hs2 + ls1_e * ls2;
-  return (res_hi << 32) | res_lo;
-}
-
-
-static inline unsigned int _deal(unsigned int src)
-{
-  int i;
-  unsigned short lo = 0, hi = 0;
-  for (i = 0; i < 32; i+= 2) {
-    lo >>= 1;
-    lo |= (src & 0x1) << 15;
-    src >>= 1;
-    hi >>= 1;
-    hi |= (src & 0x1) << 15;
-    src >>= 1;
-  }
-  return (hi << 16) | lo;
-}
-
-
-static inline long long _dmv(unsigned int src1, unsigned int src2)
-{
-  return (((long long) src1) << 32) | src2;
-}
-
-
-static inline int _dotpn2(int src1, int src2) {
-short int s1_h = src1>>16;
-	short int s1_l = src1;
-	short int s2_h = src2>>16;
-	short int s2_l = src2;
-	return s1_h * s2_h - s1_l * s2_l;
-}
-
-
-static inline int _dotp2(int src1, int src2) {
-	short int s1_h = src1>>16;
-	short int s1_l = src1;
-	short int s2_h = src2>>16;
-	short int s2_l = src2;
-	return s1_h * s2_h + s1_l * s2_l;
-}
-
-
-
-static inline int _dotpnrsu2(int src1, unsigned int src2)
-{
-  short ls1 = src1 & 0XFFFF;
-  unsigned short ls2 = src2 & 0XFFFF;
-  short hs1 = src1 >> 16;
-  unsigned short hs2 = src2 >> 16;
-
-  int result = (((long long) (int)(hs1 * hs2)) - ((long long) (int)(ls1 * ls2)) +  (1 << 15)) >> 16;
-  return result;
-}
-
-
-
-static inline int _dotprsu2(int src1, unsigned int src2) {
-  short ls1 = src1 & 0XFFFF;
-  unsigned short ls2 =  (src2 & 0XFFFF);
-  short hs1 = src1 >> 16;
-  unsigned short hs2 =  (src2 >> 16);
-
-  int result = (((long long) (int) (ls1 * ls2)) + ((long long) (int) (hs1 * hs2)) +  (1LL << 15)) >> 16;
-  return result;
-}
-
-
-
-
-
-
-
-static inline int _dotpsu4(int src1, unsigned int src2) {
-  int result;
-  signed char s1_0 = (src1 & 0xff);
-  signed char s1_1 = (src1 & 0xff00) >> 8;
-  signed char s1_2 = (src1 & 0xff0000) >> 16;
-  signed char s1_3 = (src1 & 0xff000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xff);
-  unsigned int s2_1 = (src2 & 0xff00) >> 8;
-  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
-
-  result = s1_0 * s2_0 + s1_1 * s2_1 + s1_2 * s2_2 + s1_3 * s2_3;
-  return result;
-}
-
-
-static inline unsigned int _dotpu4(unsigned int src1, unsigned int src2) {
-	unsigned char v1_0 = src1 & 0xff;
-	unsigned char v1_1 = (src1>>8) & 0xff;
-	unsigned char v1_2 = (src1>>16) & 0xff;
-	unsigned char v1_3 = (src1>>24) & 0xff;
-
-	unsigned char v2_0 = src2 & 0xff;
-	unsigned char v2_1 = (src2>>8) & 0xff;
-	unsigned char v2_2 = (src2>>16) & 0xff;
-	unsigned char v2_3 = (src2>>24) & 0xff;
-
-	unsigned v = v1_0 * v2_0  + v1_1 * v2_1 + v1_2 * v2_2 + v1_3 * v2_3;
-	return v;
-}
-
-
-static inline long long _dpack2(unsigned int src1, unsigned int src2){
-unsigned short s1[2], s2[2];
-*((int*)s1) = src1;
-*((int*)s2) = src2;
-return ((unsigned long long) s1[1] << 48) | ((unsigned long long) s2[1] << 32) | ((unsigned long long) s1[0] << 16) | ((unsigned long long) s2[0]);
-}
-
-
-static inline long long _dpackx2(unsigned int src1, unsigned int src2){
-unsigned short s1[2], s2[2];
-*((int*)s1) = src1;
-*((int*)s2) = src2;
-return ((unsigned long long) s2[0] << 48) | ((unsigned long long) s1[1] << 32) | ((unsigned long long) s1[0] << 16) | ((unsigned long long) s2[1]);
-}
-
-static inline int _ext(int src2, unsigned int csta, unsigned int cstb)
-{
-  return (src2 << csta) >> cstb;
-}
-
-static inline int _extr(int src2, int src1)
-{
-  unsigned int csta = (src1 >> 5) & 0x1f;
-  unsigned int cstb = src1 & 0x1f;
-  return (src2 << csta) >> cstb;
-}
-
-static inline unsigned int _extu(unsigned int src2, unsigned int csta, unsigned int cstb)
-{
-  return (src2 << csta) >> cstb;
-}
-
-static inline unsigned int _extur(unsigned int src2, int src1)
-{
-  unsigned int csta = (src1 >> 5) & 0x1f;
-  unsigned int cstb = src1 & 0x1f;
-  return (src2 << csta) >> cstb;
-}
-
-
-static inline unsigned long long _hi(double src) {
-	unsigned long long v;
-	*(double*)&v = src;
-	return v>>32;
-}
-
-static inline unsigned int _hill (long long src)
-{
-  return (unsigned int) (src >> 32);
-}
-
-
-
-static inline double _itod(unsigned hi, unsigned lo) {
-	double v;
-	unsigned long long ll = ((((unsigned long long)(hi))<<32) | (unsigned long long)((unsigned)lo)); 
-	*((unsigned long long *)&v) = ll;
-	return v;
-}
-
-
-static inline long long _itoll(unsigned int src2, unsigned int src1)
-{
-  return (((long long) src2) << 32) | src1;
-}
-
-
-static inline C6X_COMPAT_LONG40 _labs(C6X_COMPAT_LONG40 src2)
-{
-  long long maxv = (1LL << (40 -1)) - 1;
-  long long minv = (-1LL << (40 - 1));
-  C6X_COMPAT_LONG40 lres =  orig_L40_set(src2);
-
-  lres = lres < 0 ? -lres : lres;
-  if (lres > maxv) lres = maxv;
-  else if (lres < minv) lres = minv;
-  
-  return lres;
-}
-
-
-static inline C6X_COMPAT_LONG40 _ldotp2(int src1, int src2) {
-return (C6X_COMPAT_LONG40) _dotp2(src1, src2);
-}
-
-
-static inline unsigned int _lmbd(unsigned int src1, unsigned int src2)
-{
-  return norm_shift_amt_U_and_non_U(0,(((int) (src1 << 31)) >> 31) ^ (~src2));
-}
-
-
-static inline unsigned int _lnorm(C6X_COMPAT_LONG40 src2) {
-if (src2 == 0)
-    return 39;
-  else {
-    int hi = (int)(src2 >> 32);
-    int lo = (int)src2;
-    
-
-    long long temp = (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32;
-    temp = orig_L40_set(temp);
-
-    if (temp == 0) return 0;
-    int cnt = 0;
-    while (((temp >> 39) & 1) == ((temp >> 38) & 1)) {
-       temp <<= 1;
-       cnt++;
-       }
-  return cnt;
-  }
-}
-
-
-static inline unsigned long long _lo(double src) {
-	unsigned long long v;
-	*(double*)&v = src;
-	return v;
-}
-
-
-static inline unsigned int _loll (long long src)
-{
-  return (unsigned int) src;
-}
-
-
-static inline C6X_COMPAT_LONG40 _lsadd(int src1, C6X_COMPAT_LONG40 src2)
-{
-  long long maxv = (1LL << (40 -1)) - 1;
-  long long minv = (-1LL << (40 - 1));
-  int hi = (int)(src2 >> 32);
-  int lo = (int)src2;
-  long long src2_int =  (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32;
-
-
-  long long src2_int2 =  orig_L40_set(src2_int);
-  
-  long long res = src1 + src2_int2;
-
-  if (res > maxv) { 
-	res = maxv;
-	_overflow = 1;
-	}
-  else if (res < minv) {
-	res = minv;
-	_overflow = 1;
-	}
-
-  long long res2 = orig_L40_set(res);
-
-  res2 = (signed char)(res2 >> 32);
-  
-  C6X_COMPAT_LONG40 lres = (((C6X_COMPAT_LONG40) res2) << 32) | ((unsigned int)res);
-  return lres;
-}
-
-
-
-static inline int _max2 (int src1, int src2) {
-	short s1[2], s2[2], r[2];
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r[0] = s1[0] > s2[0] ? s1[0] : s2[0];
-	r[1] = s1[1] > s2[1] ? s1[1] : s2[1];
-	result = *(int*)r;
-	return result;
-}
-
-
-
-
-
-
-static inline unsigned int _maxu4(unsigned int src1, unsigned int src2) {
-  unsigned int res0, res1, res2, res3;
-  unsigned int s1_0 = res0 = (src1 & 0xFF);
-  unsigned int s1_1 = res1 = (src1 & 0xFF00) >> 8;
-  unsigned int s1_2 = res2 = (src1 & 0xFF0000) >> 16;
-  unsigned int s1_3 = res3 = (src1 & 0xFF000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xFF);
-  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
-  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
-
-//  unsigned int res = 0;
-
-  if (s1_0 < s2_0)
-    res0 = s2_0;
-
-  if (s1_1 < s2_1)
-    res1 = s2_1;
-
-  if (s1_2 < s2_2)
-    res2 = s2_2;
-
-  if (s1_3 < s2_3)
-    res3 = s2_3;
-
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-
-
-}
-
-static inline int _min2(int src1, int src2) {
-	short s1[2], s2[2], r[2];
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r[0] = s1[0] < s2[0] ? s1[0] : s2[0];
-	r[1] = s1[1] < s2[1] ? s1[1] : s2[1];
-	result = *(int*)r;
-	return result;
-}
-
-
-static inline unsigned int _minu4(unsigned int src1, unsigned int src2) {
-unsigned int res0, res1, res2, res3;
-  unsigned int s1_0 = res0 = (src1 & 0xFF);
-  unsigned int s1_1 = res1 = (src1 & 0xFF00) >> 8;
-  unsigned int s1_2 = res2 = (src1 & 0xFF0000) >> 16;
-  unsigned int s1_3 = res3 = (src1 & 0xFF000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xFF);
-  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
-  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
-
-//  unsigned int res = 0;
-
-  if (s1_0 > s2_0)
-    res0 = s2_0;
-
-  if (s1_1 > s2_1)
-    res1 = s2_1;
-
-  if (s1_2 > s2_2)
-    res2 = s2_2;
-
-  if (s1_3 > s2_3)
-    res3 = s2_3;
-
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-}
-
-
-static inline int _mpy(int src1, int src2) {
-return (short) src1 * (short) src2;
-}
-
-
-static inline int _mpyh(int src1, int src2) {
-return (short) (src1 >> 16) * (short) (src2 >> 16);
-}
-
-
-static inline long long _mpyhill (int src1,  int src2)
-{
-  short s1 = src1 >> 16;
-  return ((long long) src2) * s1;
-}
-
-static inline int _mpyhir(int src1, int src2)
-{
-  short s1 = src1 >> 16;
-  long long result = ((long long) src2) * s1 + (1 << 14);
-  result >>= 15;
-  return result;
-}
-
-
-static inline int _mpyhl(int src1, int src2) {
-return (short) (src1 >> 16) * (short) (src2);
-}
-
-static inline unsigned int _mpyhlu(unsigned int src1, unsigned int src2) {
-return (unsigned short) (src1 >> 16) * (unsigned short) (src2);
-}
-
-static inline int _mpyhslu(int src1, unsigned int src2) {
-return (short) (src1 >> 16) * (unsigned short) src2;
-}
-
-
-static inline int _mpyhsu(int src1, unsigned int src2) {
-return (short) (src1 >>16) * (unsigned short) (src2 >>16);
-}
-
-
-static inline unsigned int _mpyhu(unsigned int src1, unsigned int src2) {
-return (unsigned short) (src1 >>16) * (unsigned short) (src2 >> 16);
-}
-
-
-static inline int _mpyhuls(unsigned int src1, int src2) {
-return (unsigned short) (src1 >>16) * (signed short) (src2);
-}
-
-
-static inline int _mpyhus(unsigned int src1, int src2) {
-return (unsigned short) (src1 >> 16) * (short) (src2 >>16);
-}
-
-
-
-static inline long long _mpyidll (int src1, int src2)
-{
-  return (long long) src1 * src2;
-}
-
-
-static inline int _mpylh(int src1, int src2) {
-return (signed short) (src1 & 0xffff) * (signed short) (src2 >> 16);
-}
-
-static inline unsigned int _mpylhu(unsigned int src1, unsigned int src2) {
-return (unsigned short) src1 * (unsigned short) (src2 >> 16);
-}
-
-
-static inline long long _mpylill (int src1,  int src2)
-{
-  return ((long long) src2) * ((short)src1);
-}
-
-
-
-static inline int _mpylir(int src1, int src2)
-{
-  short s1 = src1;
-  long long result = ((long long) src2) * s1 + (1 << 14);
-  result >>= 15;
-  return result;
-}
-
-
-static inline int _mpylshu(int src1, unsigned int src2) {
-return (short) src1 * (unsigned short) (src2 >> 16);
-}
-
-
-static inline int _mpyluhs(unsigned int src1, int src2) {
-return (unsigned short) src1 * (short) (src2 >> 16);
-}
-
-
-
-static inline int _mpysu(int src1, unsigned int src2) {
-return (short) src1 * (unsigned short) src2;
-}
-
-
-
-static inline long long _mpysu4ll (int src1,  unsigned int src2) {
-  unsigned short res0, res1, res2, res3;
-  signed char s1_0 = (src1 & 0xff);
-  signed char s1_1 = (src1 & 0xff00) >> 8;
-  signed char s1_2 = (src1 & 0xff0000) >> 16;
-  signed char s1_3 = (src1 & 0xff000000) >> 24;
-
-  unsigned short s2_0 = (src2 & 0xff);
-  unsigned short s2_1 = (src2 & 0xff00) >> 8;
-  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
-
-  res0 = s1_0 * s2_0;
-  res1 = s1_1 * s2_1;
-  res2 = s1_2 * s2_2;
-  res3 = s1_3 * s2_3;
-
-  return (((unsigned long long) res3) << 48)
-    | (((unsigned long long) res2) << 32)
-    | (((unsigned long long) res1) << 16)
-    | res0;
-}
-
-static inline unsigned int _mpyu(unsigned int src1, unsigned int src2) {
-	unsigned v = (unsigned short)src1 * (unsigned short)src2;
-	return v;
-}
-
-static inline int _mpyus(unsigned int src1, int src2) {
-return (unsigned short) src1 * (short) src2;
-}
-
-static inline long long _mpyu4ll (unsigned int src1,  unsigned int src2) {
-  unsigned short res0, res1, res2, res3;
-  unsigned char s1_0 = (src1 & 0xff);
-  unsigned char s1_1 = (src1 & 0xff00) >> 8;
-  unsigned char s1_2 = (src1 & 0xff0000) >> 16;
-  unsigned char s1_3 = (src1 & 0xff000000) >> 24;
-
-  unsigned short s2_0 = (src2 & 0xff);
-  unsigned short s2_1 = (src2 & 0xff00) >> 8;
-  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
-
-  res0 = s1_0 * s2_0;
-  res1 = s1_1 * s2_1;
-  res2 = s1_2 * s2_2;
-  res3 = s1_3 * s2_3;
-
-  return (((unsigned long long) res3) << 48)
-    | (((unsigned long long) res2) << 32)
-    | (((unsigned long long) res1) << 16)
-    | res0;
-}
-
-
-static inline long long _mpy2ir(unsigned int src1, unsigned int src2)
-{
-  if ((src1 == 0x8000) && (src2 == 0x80000000)) {
-    _overflow = 1;
-    return 0;
-  }
-  else {
-    short ls1 = src1 & 0xffff;
-    short hs1 = src1 >> 16;
-    unsigned long long hi = (((long long) hs1) * (int) src2 + (1 << 14)) >> 15;
-    unsigned long long lo = ((((long long) ls1) * (int) src2 + (1 << 14)) >> 15) & 0xFFFFFFFF;
-    return (hi << 32) | lo;
-  }
-}
-
-
-static inline long long _mpy2ll (int src1,  int src2) {
-  short ls1 = src1 & 0xffff;
-  short hs1 = src1 >> 16;
-  short ls2 = src2 & 0xffff;
-  short hs2 = src2 >> 16;
-
-  unsigned long long hi = hs1 * hs2;
-  unsigned long long lo = (ls1 * ls2) & 0xFFFFFFFF;
-
-  return (hi << 32) | lo;
-  
-}
-
-
-static inline int _mpy32(int src1, int src2)
-{
-  return src1 * src2;
-}
-
-
-static inline long long _mpy32ll(int src1, int src2)
-{
-  return ((long long) src1) * src2;
-}
-
-static inline long long _mpy32su(int src1, unsigned int src2)
-{
-  return ((long long) src1) * ((int) src2);
-}
-
-static inline long long _mpy32u(unsigned int src1, unsigned int src2)
-{
-  return ((long long) ((int) src1)) * ((long long) ((int) src2));
-}
-
-static inline long long _mpy32us(unsigned int src1, int src2)
-{
-  return ((int) src1) * ((long long) src2);
-}
-
-static inline int _mvd (int src2)
-{
-  return src2;
-}
-
-
-static inline unsigned int _norm(int src2)
-{
-  return norm_shift_amt_U_and_non_U(1,src2);
-}
-
-
-static inline unsigned int _pack2 (unsigned int src1, unsigned int src2) {
-	short s1[2], s2[2], r[2];
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r[0] = s2[0];
-	r[1] = s1[0];
-	result = *(int*)r;
-	return result;
-}
-
-
-static inline int _packh2 (unsigned int src1, unsigned int src2) {
-	unsigned v0 = src1 & 0xffff0000;
-	unsigned v1 = src2 >> 16;
-	unsigned v = v0|v1;
-	return v;
-	
-}
-
-static inline unsigned int _packh4 (unsigned int src1, unsigned int src2) {
-	unsigned v3 = (src1 >> 24) & 0xff;
-	unsigned v2 = (src1 >> 8) & 0xff;
-	unsigned v1 = (src2 >> 24) & 0xff;
-	unsigned v0 = (src2 >> 8) & 0xff;
-	unsigned v = (v3<<24) | (v2<<16) | (v1 << 8) | v0;
-	return v;
-}
-
-static inline unsigned int _packhl2 (unsigned int src1,  unsigned int src2) {
-	unsigned v0 = src1 & 0xffff0000;
-	unsigned v1 = src2 & 0x0000ffff;
-	unsigned v = v0|v1;
-	return v;
-}
-
-static inline unsigned int _packlh2 (unsigned int src1,  unsigned int src2) {
-	unsigned v0 = src1 << 16;
-	unsigned v1 = (src2 >> 16) & 0xffff;
-	unsigned v = v0|v1;
-	return v;
-}
-
-
-
-
-static inline unsigned int _packl4 (unsigned int src1, unsigned int src2) {
-	unsigned v3 = (src1 >> 16) & 0xff;
-	unsigned v2 = (src1) & 0xff;
-	unsigned v1 = (src2 >> 16) & 0xff;
-	unsigned v0 = (src2) & 0xff;
-	unsigned v = (v3<<24) | (v2<<16) | (v1 << 8) | v0;
-	return v;
-}
-
-
-
-
-static inline unsigned int _rpack2 (unsigned int src1, unsigned int src2) {
-int s1 = (int) src1;
-int s2 = (int) src2;
-s1 = util_shift_right_saturate_n (s1, -1, 32);
-s2 = util_shift_right_saturate_n (s2, -1, 32);
-return (unsigned int) (s1 & 0xffff0000) | (unsigned int) ((s2 & 0xffff0000) >>16);
-}
-
-
-static inline unsigned int _rotl (unsigned int src1, unsigned int src2)
-{
-  src2 &= 0x1f;
-  return (src1 << src2) | (src1 >> (32 - src2));
-}
-
-
-static inline int _sadd(int src1, int src2) {
-signed long long res;
-signed long long maxv, minv;
-maxv = (1LL << (32-1)) - 1;
-minv = (-1LL << (32-1));
-res = (long long) src1 + (long long) src2;
-if (res > maxv) {
-	res = maxv;
-	_overflow = 1;
-	}
-else if (res < minv ) {
-	res = minv;
-	_overflow = 1;
-	}
-return (int) res;
-}
-
-static inline long long _saddsub(unsigned int src1, unsigned int src2) {
-int radd;
-signed long long rsub;
-
-signed long long maxv, minv;
-maxv = (1LL << (32-1)) - 1;
-minv = (-1LL << (32-1));
-
-radd = (int) src1 + (int) src2;
-
-//   saturate on subtract, not add
-
-
-rsub = (long long) ((int) src1) - (long long) ((int) src2);
-if (rsub > maxv) {
-	rsub = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (rsub < minv ) {
-	rsub = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-return  (((unsigned long long) radd) << 32) |  ( rsub & 0x00000000ffffffff ) ;
-}
-
-
-
-static inline long long _saddsub2(unsigned int src1, unsigned int src2) {
-signed int radd[2];
-signed int rsub[2];
-signed short s1[2], s2[2];
-
-signed int maxv, minv;
-maxv = (1L << (16-1)) - 1;
-minv = (-1L << (16-1));
-
-*((int*)s1) = src1;
-*((int*)s2) = src2;
-
-radd[0] =  (int) s1[0] + (int) s2[0];
-radd[1] =  (int) s1[1] + (int) s2[1];
-
-rsub[0] =  (int) s1[0] - (int) s2[0];
-rsub[1] =  (int) s1[1] - (int) s2[1];
-
-if (radd[0] > maxv) {
-	radd[0] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (radd[0] < minv ) {
-	radd[0] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-if (radd[1] > maxv) {
-	radd[1] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (radd[1] < minv ) {
-	radd[1] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-
-if (rsub[0] > maxv) {
-	rsub[0] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (rsub[0] < minv ) {
-	rsub[0] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-if (rsub[1] > maxv) {
-	rsub[1] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (rsub[1] < minv ) {
-	rsub[1] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-
-return  ((((unsigned long long) radd[1]) & 0x000000000000ffff) << 48) | 
-        ((((unsigned long long) radd[0]) & 0x000000000000ffff) << 32) | 
-        ((((unsigned long long) rsub[1]) & 0x000000000000ffff) << 16) |
-        ((((unsigned long long) rsub[0]) & 0x000000000000ffff));
-}
-
-
-
-static inline  int _sadd2(int src1, int src2) {
-signed short s1[2], s2[2];
-signed int r[2], maxv, minv;
-
-maxv = (1L << (16-1)) - 1;
-minv = (-1L << (16-1));
-
-
-*((int*)s1) = src1;
-*((int*)s2) = src2;
-
-r[0] =  (int) s1[0] + (int) s2[0];
-r[1] =  (int) s1[1] + (int) s2[1];
-
-if (r[0] > maxv) {
-	r[0] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (r[0] < minv ) {
-	r[0] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-if (r[1] > maxv) {
-	r[1] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (r[1] < minv ) {
-	r[1] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-return ((r[1] & 0xffff) << 16 ) | (r[0] & 0xffff) ;
-}
-
-
-static inline int _saddus2(unsigned int src1, int src2) {
-int res0, res1;
-  unsigned int s1_0 = (src1 & 0xffff);
-  unsigned int s1_1 = (src1 & 0xffff0000) >> 16;
-
-  short s2_0 = (src2 & 0xffff);
-  short s2_1 = (src2 & 0xffff0000) >> 16;
-
-  res0 = s1_0 + s2_0;
-  res1 = s1_1 + s2_1;
-
-  if (res0 >= 0x10000)
-    res0 = 0xffff;
-  else if (res0 < 0)
-    res0 = 0;
-  
-  if (res1 >= 0x10000)
-    res1 = 0xffff;
-  else if (res1 < 0)
-    res1 = 0;
-  
-  return (res1 << 16) | res0;
-}
-
-
-static inline unsigned int _saddu4(unsigned int src1, unsigned int src2) {
-unsigned int res0, res1, res2, res3;
-  unsigned int s1_0 = (src1 & 0xff);
-  unsigned int s1_1 = (src1 & 0xff00) >> 8;
-  unsigned int s1_2 = (src1 & 0xff0000) >> 16;
-  unsigned int s1_3 = (src1 & 0xff000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xff);
-  unsigned int s2_1 = (src2 & 0xff00) >> 8;
-  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
-
-  res0 = s1_0 + s2_0;
-  res1 = s1_1 + s2_1;
-  res2 = s1_2 + s2_2;
-  res3 = s1_3 + s2_3;
-
-  if (res0 >= 0x100)
-    res0 = 0xff;
-  
-  if (res1 >= 0x100)
-    res1 = 0xff;
-  
-  if (res2 >= 0x100)
-    res2 = 0xff;
-  
-  if (res3 >= 0x100)
-    res3 = 0xff;
-
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-
-}
-
-
-
-static inline int _sat(C6X_COMPAT_LONG40 src2)
-{
-  long long maxv = (1LL << (32-1)) - 1;
-  long long minv = (-1LL << (32-1));
-
-  int hi = (int)(src2 >> 32);
-  int lo = (int)src2;
-  long long temp = (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32;
-  temp = orig_L40_set(temp);
-  
-  if (temp > maxv) {
-	temp = maxv;
-	_overflow = 1;
-	}
-  else if (temp < minv) {
-	temp = minv;
-	_overflow = 1;
-	}
-  return (int) temp;
-}
-
-static inline unsigned int _set(unsigned int src2, unsigned int csta, unsigned int cstb)
-{
-  csta &= 0x1f;
-  cstb &= 0x1f;
-  if (csta > cstb)
-    return src2;
-  else {
-    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
-    return src2 | mask;
-  }
-}
-
-static inline unsigned int _setr(unsigned int src2, int src1)
-{
-  unsigned int csta = (src1 >> 5) & 0x1f;
-  unsigned int cstb = src1 & 0x1f;
-  if (csta > cstb)
-    return src2;
-  else {
-    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
-    return src2 | mask;
-  }
-}
-
-
-static inline unsigned int _shfl (unsigned int src2)
-{
-  unsigned short lo = src2;
-  unsigned short hi = src2 >> 16;
-  unsigned int result = 0;
-  int i;
-  for (i = 0; i < 32; i+= 2) {
-    result >>= 1;
-    result |= (lo & 0x1) << 31;
-    lo >>= 1;
-    result >>= 1;
-    result |= (hi & 0x1) << 31;
-    hi >>= 1;
-  }
-  return result;
-}
-
-static inline long long _shfl3 (unsigned int src1, unsigned int src2)
-{
-  unsigned short lo = src2;
-  unsigned short hi = src1 >> 16;
-  unsigned short mid = src1;
-  unsigned long long result = 0;
-  int i;
-  for (i = 0; i < 32; i+= 2) {
-    result >>= 1;
-    result |= ((unsigned long long) (lo & 0x1)) << 47;
-    lo >>= 1;
-    result >>= 1;
-    result |= ((unsigned long long) (mid & 0x1)) << 47;
-    mid >>= 1;
-    result >>= 1;
-    result |= ((unsigned long long) (hi & 0x1)) << 47;
-    hi >>= 1;
-  }
-  return result;
-}
-
-
-
-static inline unsigned int _shlmb (unsigned int src1, unsigned int src2)
-{
-  return (src2 << 8) | (src1 >> 24);
-}
-
-static inline unsigned int _shrmb (unsigned int src1, unsigned int src2)
-{
-  return (src2 >> 8) | (src1 << 24);
-}
-
-
-static inline unsigned int _shru2 (unsigned int src1, unsigned int src2) {
-unsigned short hs1 = src1 >> 16;
-  unsigned short ls1 = src1 & 0xFFFF;
-  hs1 >>= src2;
-  ls1 >>= src2;
-  return (hs1 << 16) | ls1;
-}
-
-
-static inline int _shr2 (int src1, unsigned int src2) {
-  short s1[2], result[2];
-  *((int*)s1) = src1;
-  src2 = src2 & 31;
-  result[0] = (int)s1[0] >> src2;
-  result[1] = (int)s1[1] >> src2;
-
-  return *(int*)result;
-}
-
-
-static inline int _smpy (int src1, int src2) {
-unsigned long long result;
-result =  (((short) src1 * (short) src2) << 1);
-
-if ((result & 0xffffffff) == 0x80000000){
-    result = 0x7fffffff;
-    _overflow = 1;
-  }
-return (int) (result);
-}
-
-static inline int _smpyh (int src1, int src2) {
-unsigned long long result;
-result =  ((short) (src1 >> 16) * (short) (src2 >> 16)) << 1;
-if ((result & 0xffffffff) == 0x80000000){
-    result = 0x7fffffff;
-    _overflow = 1;
-  }
-return (int) (result);
-}
-
-static inline int _smpyhl (int src1, int src2) {
-unsigned long long result;
-result = ((short) (src1 >> 16) * (short) (src2)) << 1;
-if ((result & 0xffffffff) == 0x80000000){
-    result = 0x7fffffff;
-    _overflow = 1;
-  }
-return (int) (result);
-}
-
-static inline int _smpylh (int src1, int src2) {
-unsigned long long result;
-result = ((short) (src1) * (short) (src2 >> 16)) << 1;
-if ((result & 0xffffffff) == 0x80000000){
-    result = 0x7fffffff;
-    _overflow = 1;
-  }
-return (int) (result);
-}
-
-static inline long long _smpy2ll (int src1,  int src2) {
-  short ls1 = src1 & 0XFFFF;
-  short hs1 = src1 >> 16;
-  short ls2 = src2 & 0XFFFF;
-  short hs2 = src2 >> 16;
-
-  unsigned long long hi = (hs1 * hs2) << 1;
-  unsigned long long lo = ((ls1 * ls2) << 1) & 0xFFFFFFFF;
-  if ((hi & 0xffffffff) == 0x80000000){
-    hi = 0x7fffffff;
-    _overflow = 1;
-  }
-
-  if ((lo & 0xffffffff) == 0x80000000){
-    lo = 0x7fffffff;
-    _overflow = 1;
-  }
-
-  return (hi << 32) | lo;
-}
-
-
-
-
-static inline int _smpy32(int src1, int src2)
-{
-  long long res = (long long) src1 * src2;
-  res <<= 1;
-  res >>= 32;
-  return res;
-}
-
-static inline unsigned char TEN_satu8 (short src)
-{
-  if (src > 0xff)
-    return 0xff;
-  else if (src < 0)
-    return 0;
-  else
-    return src;
-}
-
-static inline int _spack2 (int src1, int src2) {
-short s1 = (short) util_saturate_n_no_state(src1,16);
-short s2 = (short) util_saturate_n_no_state(src2,16);
-return  ( (unsigned int) s1 << 16) | (((int) s2) & 0xFFFF);
-}
-
-
-static inline unsigned int _spacku4 (int src1, int src2) {
-  short lolo = src2;
-  short lohi = src2 >> 16;
-  short hilo = src1;
-  short hihi = src1 >> 16;
-
-  lolo = TEN_satu8(lolo);
-  lohi = TEN_satu8(lohi);
-  hilo = TEN_satu8(hilo);
-  hihi = TEN_satu8(hihi);
-
-  return (((unsigned int) hihi) <<  24) | (((unsigned int) hilo) << 16) | (lohi << 8) | lolo;
-}
-
-
-
-static inline int _sshl (int src1, unsigned int src2) {
-short local2 = (short)(src2 & 0x7FFF);
-return (int) util_shift_right_saturate_n(src1, -local2, 32);
-}
-
-
-
-
-static inline int _sshvl (int src2, int src1) {
-  short s1;
-  if (src1 > 31)
-    s1 = 31;
-  else if (src1 < -31)
-    s1 = -31;
-  else
-    s1 = src1;
-
-  return (int) util_shift_right_saturate_n(src2, -s1, 32);
-}
-
-
-
-
-
-static inline int _sshvr (int src2, int src1) {
-short s1;
-  if (src1 > 31)
-    s1 = 31;
-  else if (src1 < -31)
-    s1 = -31;
-  else
-    s1 = src1;
-  return (int) util_shift_right_saturate_n(src2, s1, 32);
-}
-
-
-
-
-static inline int _ssub(int src1, int src2) {
-signed long long res;
-signed long long maxv, minv;
-maxv = (1LL << (32-1)) - 1;
-minv = (-1LL << (32-1));
-res = (long long) src1 - (long long) src2;
-if (res > maxv) {
-	res = maxv;
-	_overflow = 1;
-	}
-else if (res < minv ) {
-	res = minv;
-	_overflow = 1;
-	}
-return (int) res;
-}
-
-static inline int _ssub2(int src1, int src2) {
-signed short s1[2], s2[2];
-signed int r[2], maxv, minv;
-
-maxv = (1L << (16-1)) - 1;
-minv = (-1L << (16-1));
-
-
-*((int*)s1) = src1;
-*((int*)s2) = src2;
-
-r[0] =  (int) s1[0] - (int) s2[0];
-r[1] =  (int) s1[1] - (int) s2[1];
-
-if (r[0] > maxv) {
-	r[0] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (r[0] < minv ) {
-	r[0] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-if (r[1] > maxv) {
-	r[1] = maxv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-else if (r[1] < minv ) {
-	r[1] = minv;
-        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
-	/*  _overflow = 1; */
-	}
-
-return ((r[1] & 0xffff) << 16 ) | (r[0] & 0xffff) ;
-}
-
-
-static inline int _subabs4 (int src1, int src2) {
-  int res0, res1, res2, res3;
-  unsigned int s1_0 = (src1 & 0xff);
-  unsigned int s1_1 = (src1 & 0xff00) >> 8;
-  unsigned int s1_2 = (src1 & 0xff0000) >> 16;
-  unsigned int s1_3 = (src1 & 0xff000000) >> 24;
-
-  unsigned int s2_0 = (src2 & 0xff);
-  unsigned int s2_1 = (src2 & 0xff00) >> 8;
-  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
-  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
-
-  res0 = s1_0 - s2_0;
-  res1 = s1_1 - s2_1;
-  res2 = s1_2 - s2_2;
-  res3 = s1_3 - s2_3;
-
-  if (res0 < 0)
-    res0 = -res0;
-  
-  if (res1 < 0)
-    res1 = -res1;
-  
-  if (res2 < 0)
-    res2 = -res2;
-  
-  if (res3 < 0)
-    res3 = -res3;
-
-  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
-}
-
-
-static inline unsigned int _subc (unsigned int src1, unsigned int src2)
-{
-  if ( src1 >=  src2)
-    return ((src1 - src2) <<  1) +  1;
-  else
-    return src1 << 1;
-}
-
-
-
-static inline int _sub2(int src1, int src2) {
-	short s1[2], s2[2], r[2];
-	int result;
-	*((int*)s1) = src1;
-	*((int*)s2) = src2;
-	r[0] = s1[0] - s2[0];
-	r[1] = s1[1] - s2[1];
-	result = *(int*)r;
-	return result;
-}
-
-
-static inline int _sub4(int src1, int src2) {
-	char c1[4], c2[4], r[4];
-	int result;
-	*((int*)c1) = src1;
-	*((int*)c2) = src2;
-	r[0] = c1[0] - c2[0];
-	r[1] = c1[1] - c2[1];
-	r[2] = c1[2] - c2[2];
-	r[3] = c1[3] - c2[3];
-	result = *(int*)r;
-	return result;
-}
-
-
-static inline int _swap4 (unsigned int src1) {
-	unsigned char v0 = src1;
-	unsigned char v1 = src1 >> 8;
-	unsigned char v2 = src1 >> 16;
-	unsigned char v3 = src1 >> 24;
-	unsigned v = v0<<8 | v1 | v2<<24 | v3<<16;
-	return v;
-}
-
-static inline unsigned int _unpkhu4 (unsigned int src1) {
-	unsigned v0 = src1>>24;
-	unsigned v1 = (src1>>16) & 0xff;
-	return (v0<<16) | v1;
-}
-
-static inline unsigned int _unpklu4 (unsigned int src1) {
-	unsigned v1 = (src1>>8) & 0xff;
-	unsigned v0 = (src1) & 0xff;
-	return (v1<<16) | v0;
-}
-
-
-
-
-static inline unsigned int _xpnd2 (unsigned int src1) {
-      int v0 = (src1 & 0x1) ? 0x0000ffff : 0x00000000;
-      int v1 = (src1 & 0x2) ? 0xffff0000 : 0x00000000;
-      return v0|v1;
-}
-
-static inline unsigned int _xpnd4 (unsigned int src1) {
-      int v0 = (src1 & 0x1) ? 0x000000ff : 0x00000000;
-      int v1 = (src1 & 0x2) ? 0x0000ff00 : 0x00000000;
-      int v2 = (src1 & 0x4) ? 0x00ff0000 : 0x00000000;
-      int v3 = (src1 & 0x8) ? 0xff000000 : 0x00000000;
-      int r = v0|v1|v2|v3;
-      return r;
-}
-
-
-
-//     end of Implemented in alphabetical order
-
-
-#endif /* __C6X_COMPAT__H */
+/*
+ * Copyright (c) 2006-2010 Tensilica Inc. ALL RIGHTS RESERVED.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __C6X_COMPAT__H
+#define __C6X_COMPAT__H
+
+/* Unimplemented functions _gmpy, _gmpy4, _xormpy, _lssub, _cmpy, _cmpyr,
+   _cmpyr1, _ddotpl2r, _ddotph2r */
+
+
+typedef long long C6X_COMPAT_LONG40;
+
+
+#define _memd8(a) (*((double*)(a)))
+#define _memd8_const(a) (*((const double*)(a)))
+
+#define _amemd8(a) (*((double*)(a)))
+#define _amemd8_const(a) (*((const double*)(a)))
+
+#define _mem8(a) (*((unsigned long long*)(a)))
+#define _mem8_const(a) (*((const unsigned long long*)(a)))
+
+#define _mem4(a) (*((unsigned*)(a)))
+#define _mem4_const(a) (*((const unsigned*)(a)))
+#define _amem4_const(a) (*((const unsigned*)(a)))
+
+/*  NOTE: To emulate a C6X properly you should define global variables
+    for your Xtensa with these names.  Some of the emulation routines
+    will set these values.  */
+
+extern int _carry;
+extern int _overflow;
+
+//   Utility routines
+
+
+#define TESTBIT(x,n) (((x) >> (n)) & 1)
+
+#define NSA_BITS 32
+
+static inline unsigned int norm_shift_amt_U_and_non_U(int is_signed, int inp) {
+int j=0, k=0;
+int x=inp;
+if (is_signed) {
+    /* Invert signed val if negative */
+    x= TESTBIT(x,(NSA_BITS-1))? ~x: x;
+    x= (x&1)|(x<<1);		/* Shift up to return count-1 */
+    if (x ==0) 
+      return NSA_BITS-1;
+  }
+  if (x ==0) 
+    return NSA_BITS;
+  /* Now count leading zeros */
+  for (j=0, k=NSA_BITS-1; k>=0; j++, k--) {
+    if (TESTBIT(x,k)) 
+      return j;
+  }
+  return NSA_BITS;
+}
+
+
+
+static inline long long
+orig_L40_set( long long L40_var1) {
+   long long L40_var_out;
+   
+   L40_var_out =  L40_var1 & 0x000000ffffffffffLL;
+
+   if( L40_var1 & 0x8000000000LL)
+      L40_var_out = L40_var_out | 0xffffff0000000000LL;
+
+   return( L40_var_out);
+}
+
+
+
+static inline signed long long
+util_saturate_n_no_state(signed long long t, int n)
+{
+  signed long long maxv, minv;
+  maxv = (1LL << (n-1)) - 1;
+  minv = (-1LL << (n-1));
+  if (t > maxv) {
+    t = maxv;
+  } else if (t < minv) {
+    t = minv;
+  }
+  return t;
+}
+
+
+static inline signed long long
+util_saturate_n_sgn(signed long long t, int n)
+{
+  signed long long result;
+  signed long long maxv, minv;
+  maxv = (1LL << (n-1)) - 1;
+  minv = (-1LL << (n-1));
+  if (t > 0) {
+    result = maxv;
+    _overflow = 1;
+  } else if (t < 0) {
+    result = minv;
+    _overflow = 1;
+  } else {
+    result = 0;
+  }
+  return result;
+}
+
+
+
+
+/* well-behaved signed shift right (left on negative) with
+   saturation */
+static inline signed long long
+util_shift_right_saturate_n(signed long long t, int shval, int n)
+{
+  /* n should be <= 62 */
+  long long result;
+
+  signed long long mask;
+  int actual_shift = shval;
+  long long shft = actual_shift > 0 ? actual_shift : -actual_shift;
+
+  if (t == 0 || actual_shift == 0)
+    return t;
+
+  if (actual_shift >= n) {
+    return (t < 0) ? -1 : 0;
+  }
+  if (actual_shift <= -n) {
+    return util_saturate_n_sgn(t, n);
+  }
+  if (actual_shift > 0) {
+    return t >> actual_shift;
+  }
+  /* actual_shift < 0. Check for saturation after shift. */
+  mask = (-1LL << (n-shft-1));
+  if (t > 0 && ((mask & t) != 0)) {
+    return util_saturate_n_sgn(t, n);
+  }
+  if (t < 0 && ((mask & t) != mask)) {
+    return util_saturate_n_sgn(t, n);
+  }
+  result = t << shft;
+
+  return result;
+}
+
+
+/* Implemented c6x standard C compatibility functions (alphabetical
+   order) */
+
+
+static inline int _abs(int src1) {
+	if ((unsigned) src1 == (unsigned) 0x80000000) {
+		return 0x7fffffff;
+	}
+	return abs(src1);
+}
+
+
+static inline int _abs2(int src1) {
+	short s1[2],r[2];
+	int result;
+	*((int*)s1) = src1;
+	if ((unsigned short) s1[1] == (unsigned short) 0x8000) r[1] = 0x7fff;
+	   else r[1] = abs(s1[1]);
+	if ((unsigned short) s1[0] == (unsigned short) 0x8000) r[0] = 0x7fff;
+	   else r[0] = abs(s1[0]);
+	result = *(int*)r;
+	return result;
+	}
+
+
+
+
+static inline int _add2(int src1, int src2) {
+	short s1[2], s2[2], r[2];
+	int result;
+	*((int*)s1) = src1;
+	*((int*)s2) = src2;
+	r[0] = s1[0] + s2[0];
+	r[1] = s1[1] + s2[1];
+	result = *(int*)r;
+	return result;
+}
+
+static inline int _add4(int src1, int src2) {
+	char c1[4], c2[4], r[4];
+	int result;
+	*((int*)c1) = src1;
+	*((int*)c2) = src2;
+	r[0] = c1[0] + c2[0];
+	r[1] = c1[1] + c2[1];
+	r[2] = c1[2] + c2[2];
+	r[3] = c1[3] + c2[3];
+	result = *(int*)r;
+	return result;
+}
+
+
+
+static inline long long _addsub(unsigned int src1, unsigned int src2)
+{
+  
+  int res_lo;
+  int res_hi;
+ 
+  res_hi = src1+src2;
+  res_lo = src1-src2;
+  return (((unsigned long long) res_hi) << 32) | ((unsigned int) res_lo) ;
+}
+
+  
+static inline long long _addsub2(unsigned int src1, unsigned int src2)
+{
+  short s1[2], s2[2], ra[2], rs[2];
+  int res_lo;
+  int res_hi;
+
+  *((int*)s1) = src1;
+  *((int*)s2) = src2;
+  ra[0] = s1[0] + s2[0];
+  ra[1] = s1[1] + s2[1];
+  rs[0] = s1[0] - s2[0];
+  rs[1] = s1[1] - s2[1];
+  
+  res_hi = *(int*)ra;
+  res_lo = *(int*)rs;
+  return (((unsigned long long) res_hi) << 32) | ((unsigned int) res_lo) ;
+}
+
+
+static inline int _avg2(int src1, int src2) {
+  int low = (((int)1 +  (short) src1 + (short) src2) >> 1) & 0XFFFF;
+  int high1 = src1 >> 16;
+  int high2 = src2 >> 16;
+  int high = ((high1 + high2 + 1) >> 1)<< 16;
+  return high | low;
+}
+
+
+
+static inline unsigned int _avgu4(unsigned int src1, unsigned int src2) {
+unsigned int res0 = ((src1 & 0xFF) + (src2 & 0xFF) + 1) >> 1;
+  unsigned int res1 = (((src1 & 0xFF00) >> 8) + ((src2 & 0xFF00) >> 8) + 1) >> 1;
+  unsigned int res2 = (((src1 & 0xFF0000) >> 16) + ((src2 & 0xFF0000) >> 16) + 1) >> 1;
+  unsigned int res3 = (((src1 & 0xFF000000) >> 24) + ((src2 & 0xFF000000) >> 24) + 1) >> 1;
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+}
+
+
+static inline int TEN_popc (unsigned char b)
+{
+  int i, result = 0;
+  for (i = 0; i <  8; i++){
+    if (b & 0x1)
+      result++;
+    b >>= 1;
+  }
+  return result;
+}
+
+static inline unsigned int _bitc4(unsigned int src1)
+{
+  unsigned int res0 = TEN_popc(src1 & 0xFF);
+  unsigned int res1 = TEN_popc((src1 & 0xFF00) >> 8);
+  unsigned int res2 = TEN_popc((src1 & 0xFF0000) >> 16);
+  unsigned int res3 = TEN_popc((src1 & 0xFF000000) >> 24);
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+}
+
+static inline unsigned int _bitr(unsigned int src) {
+	int i;
+	unsigned r = 0;
+	for (i = 0; i< 32; ++i) {
+		r = r | (((src >> i) & 1)<<(31-i));
+	}
+	return r;
+}
+
+
+static inline unsigned int _clr(unsigned int src2,  int csta,  int cstb)
+{
+  csta &= 0x1f;
+  cstb &= 0x1f;
+  if (csta > cstb)
+    return src2;
+  else {
+    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
+    return src2 & (~mask);
+  }
+}
+
+static inline unsigned int _clrr(unsigned int src2, int src1)
+{
+  unsigned int csta = (src1 >> 5) & 0x1f;
+  unsigned int cstb = src1 & 0x1f;
+  if (csta > cstb)
+    return src2;
+  else {
+    unsigned int mask =  (((1 <<  (cstb -  csta)) <<  1) -  1) <<  csta;
+    return src2 & (~mask);
+  }
+}
+
+
+
+
+static inline int _cmpeq2(int src1, int src2) {
+	short s1[2], s2[2];
+	int r0, r1;
+	int result;
+	*((int*)s1) = src1;
+	*((int*)s2) = src2;
+	r0 = s1[0] == s2[0] ? 1 : 0;
+	r1 = s1[1] == s2[1] ? 1 : 0;
+	result = (r1 << 1) | r0;
+	return result;
+}
+
+static inline int _cmpeq4(int src1, int src2) {
+	char s1[4], s2[4];
+	int r0, r1, r2, r3;
+	int result;
+	*((int*)s1) = src1;
+	*((int*)s2) = src2;
+	r0 = s1[0] == s2[0] ? 1 : 0;
+	r1 = s1[1] == s2[1] ? 1 : 0;
+	r2 = s1[2] == s2[2] ? 1 : 0;
+	r3 = s1[3] == s2[3] ? 1 : 0;
+	result = (r3 << 3) | (r2 << 2) | (r1 << 1) | r0;
+	return result;
+}
+
+
+static inline int _cmpgt2(int src1, int src2) {
+	short s1[2], s2[2];
+	int r1, r0;
+	int result;
+	*((int*)s1) = src1;
+	*((int*)s2) = src2;
+	r0 = s1[0] > s2[0] ? 1 : 0;
+	r1 = s1[1] > s2[1] ? 1 : 0;
+	result = (r1<<1) | r0;
+	return result;
+}
+
+
+static inline unsigned int _cmpgtu4(unsigned int src1, unsigned int src2) {
+  unsigned int s1_0 = (src1 & 0xFF);
+  unsigned int s1_1 = (src1 & 0xFF00) >> 8;
+  unsigned int s1_2 = (src1 & 0xFF0000) >> 16;
+  unsigned int s1_3 = (src1 & 0xFF000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xFF);
+  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
+  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
+
+  unsigned int result = 0;
+
+  if (s1_0 > s2_0)
+    result |= 0x1;
+
+  if (s1_1 > s2_1)
+    result |= 0x2;
+
+  if (s1_2 > s2_2)
+    result |= 0x4;
+
+  if (s1_3 > s2_3)
+    result |= 0x8;
+
+  return result;
+}
+
+
+
+
+static inline long long _ddotp4(unsigned int src1, unsigned int src2) {
+  unsigned int res0, res1;
+  short s1_0 = (src1 & 0xffff);
+  short s1_1 = (src1 & 0xfff0000) >> 16;
+
+  unsigned short s2_0 = (src2 & 0xff);
+  unsigned short s2_1 = (src2 & 0xff00) >> 8;
+  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
+
+  res0 = ((int)s1_0) * s2_0 + ((int)s1_1) * s2_1;
+  res1 = ((int)s1_0) * s2_2 + ((int)s1_1) * s2_3;
+
+  return (res1 << 16) | res0;
+}
+
+
+static inline long long _ddotph2(long long src1_o_src1_e, unsigned int src2)
+{
+
+  unsigned int src1_o = src1_o_src1_e >> 32;  
+  unsigned int src1_e = src1_o_src1_e & 0xFFFFFFFF;  
+  short ls1_o = src1_o & 0XFFFF;
+  short hs1_o = src1_o >> 16;
+//  short ls1_e = src1_e & 0XFFFF;
+  short hs1_e = src1_e >> 16;
+  short ls2 = src2 & 0XFFFF;
+  short hs2 = src2 >> 16;
+
+  unsigned long long res_hi = ls2 * ls1_o + hs2 * hs1_o;
+  unsigned int res_lo = ls1_o * hs2 + hs1_e * ls2;
+  return (res_hi << 32) | res_lo;
+}
+
+
+static inline long long _ddotpl2(long long src1_o_src1_e, unsigned int src2)
+{
+  unsigned int src1_o = src1_o_src1_e >> 32;  
+  unsigned int src1_e = src1_o_src1_e & 0xFFFFFFFF;  
+  short ls1_o = src1_o & 0XFFFF;
+//  short hs1_o = src1_o >> 16;
+  short ls1_e = src1_e & 0XFFFF;
+  short hs1_e = src1_e >> 16;
+  short ls2 = src2 & 0XFFFF;
+  short hs2 = src2 >> 16;
+
+  unsigned long long res_hi = ls2 * hs1_e + hs2 * ls1_o;
+  unsigned res_lo = hs1_e * hs2 + ls1_e * ls2;
+  return (res_hi << 32) | res_lo;
+}
+
+
+static inline unsigned int _deal(unsigned int src)
+{
+  int i;
+  unsigned short lo = 0, hi = 0;
+  for (i = 0; i < 32; i+= 2) {
+    lo >>= 1;
+    lo |= (src & 0x1) << 15;
+    src >>= 1;
+    hi >>= 1;
+    hi |= (src & 0x1) << 15;
+    src >>= 1;
+  }
+  return (hi << 16) | lo;
+}
+
+
+static inline long long _dmv(unsigned int src1, unsigned int src2)
+{
+  return (((long long) src1) << 32) | src2;
+}
+
+
+static inline int _dotpn2(int src1, int src2) {
+short int s1_h = src1>>16;
+	short int s1_l = src1;
+	short int s2_h = src2>>16;
+	short int s2_l = src2;
+	return s1_h * s2_h - s1_l * s2_l;
+}
+
+
+static inline int _dotp2(int src1, int src2) {
+	short int s1_h = src1>>16;
+	short int s1_l = src1;
+	short int s2_h = src2>>16;
+	short int s2_l = src2;
+	return s1_h * s2_h + s1_l * s2_l;
+}
+
+
+
+static inline int _dotpnrsu2(int src1, unsigned int src2)
+{
+  short ls1 = src1 & 0XFFFF;
+  unsigned short ls2 = src2 & 0XFFFF;
+  short hs1 = src1 >> 16;
+  unsigned short hs2 = src2 >> 16;
+
+  int result = (((long long) (int)(hs1 * hs2)) - ((long long) (int)(ls1 * ls2)) +  (1 << 15)) >> 16;
+  return result;
+}
+
+
+
+static inline int _dotprsu2(int src1, unsigned int src2) {
+  short ls1 = src1 & 0XFFFF;
+  unsigned short ls2 =  (src2 & 0XFFFF);
+  short hs1 = src1 >> 16;
+  unsigned short hs2 =  (src2 >> 16);
+
+  int result = (((long long) (int) (ls1 * ls2)) + ((long long) (int) (hs1 * hs2)) +  (1LL << 15)) >> 16;
+  return result;
+}
+
+
+
+
+
+
+
+static inline int _dotpsu4(int src1, unsigned int src2) {
+  int result;
+  signed char s1_0 = (src1 & 0xff);
+  signed char s1_1 = (src1 & 0xff00) >> 8;
+  signed char s1_2 = (src1 & 0xff0000) >> 16;
+  signed char s1_3 = (src1 & 0xff000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xff);
+  unsigned int s2_1 = (src2 & 0xff00) >> 8;
+  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
+
+  result = s1_0 * s2_0 + s1_1 * s2_1 + s1_2 * s2_2 + s1_3 * s2_3;
+  return result;
+}
+
+
+static inline unsigned int _dotpu4(unsigned int src1, unsigned int src2) {
+	unsigned char v1_0 = src1 & 0xff;
+	unsigned char v1_1 = (src1>>8) & 0xff;
+	unsigned char v1_2 = (src1>>16) & 0xff;
+	unsigned char v1_3 = (src1>>24) & 0xff;
+
+	unsigned char v2_0 = src2 & 0xff;
+	unsigned char v2_1 = (src2>>8) & 0xff;
+	unsigned char v2_2 = (src2>>16) & 0xff;
+	unsigned char v2_3 = (src2>>24) & 0xff;
+
+	unsigned v = v1_0 * v2_0  + v1_1 * v2_1 + v1_2 * v2_2 + v1_3 * v2_3;
+	return v;
+}
+
+
+static inline long long _dpack2(unsigned int src1, unsigned int src2){ /* Packs 16-bit lanes into 64 bits, most significant first: s1[1], s2[1], s1[0], s2[0]. */
+unsigned short s1[2], s2[2];
+*((int*)s1) = src1; /* NOTE(review): stores through an int* alias; which half of src1 lands in s1[0] depends on the target byte order */
+*((int*)s2) = src2;
+return ((unsigned long long) s1[1] << 48) | ((unsigned long long) s2[1] << 32) | ((unsigned long long) s1[0] << 16) | ((unsigned long long) s2[0]);
+}
+
+
+static inline long long _dpackx2(unsigned int src1, unsigned int src2){ /* Packs 16-bit lanes into 64 bits, most significant first: s2[0], s1[1], s1[0], s2[1]. */
+unsigned short s1[2], s2[2];
+*((int*)s1) = src1; /* NOTE(review): stores through an int* alias; which half of src1 lands in s1[0] depends on the target byte order */
+*((int*)s2) = src2;
+return ((unsigned long long) s2[0] << 48) | ((unsigned long long) s1[1] << 32) | ((unsigned long long) s1[0] << 16) | ((unsigned long long) s2[1]);
+}
+
+static inline int _ext(int src2, unsigned int csta, unsigned int cstb)
+{ /* Shifts the signed value src2 left by csta bits, then right by cstb bits. */
+  return (src2 << csta) >> cstb; /* both shifts act on a signed int; counts are used as given, without masking */
+}
+
+static inline int _extr(int src2, int src1)
+{ /* Shifts signed src2 left then right, with both counts taken from bit fields of src1. */
+  unsigned int csta = (src1 >> 5) & 0x1f; /* left-shift count: bits 9..5 of src1 */
+  unsigned int cstb = src1 & 0x1f; /* right-shift count: bits 4..0 of src1 */
+  return (src2 << csta) >> cstb;
+}
+
+static inline unsigned int _extu(unsigned int src2, unsigned int csta, unsigned int cstb)
+{ /* Shifts the unsigned value src2 left by csta bits, then logically right by cstb bits. */
+  return (src2 << csta) >> cstb; /* counts are used as given, without masking */
+}
+
+static inline unsigned int _extur(unsigned int src2, int src1)
+{ /* Shifts unsigned src2 left then right, with both counts taken from bit fields of src1. */
+  unsigned int csta = (src1 >> 5) & 0x1f; /* left-shift count: bits 9..5 of src1 */
+  unsigned int cstb = src1 & 0x1f; /* right-shift count: bits 4..0 of src1 */
+  return (src2 << csta) >> cstb;
+}
+
+
+static inline unsigned long long _hi(double src) { /* Returns the upper 32 bits of the 64-bit object representation of src. */
+	unsigned long long v;
+	*(double*)&v = src; /* copies the bit pattern of src into v through a pointer cast */
+	return v>>32;
+}
+
+static inline unsigned int _hill (long long src)
+{ /* Returns bits 63..32 of src. */
+  return (unsigned int) (src >> 32);
+}
+
+
+
+static inline double _itod(unsigned hi, unsigned lo) { /* Builds a double whose 64-bit object representation is hi (upper word) followed by lo (lower word). */
+	double v;
+	unsigned long long ll = ((((unsigned long long)(hi))<<32) | (unsigned long long)((unsigned)lo)); 
+	*((unsigned long long *)&v) = ll; /* writes the bit pattern into v through a pointer cast */
+	return v;
+}
+
+
+static inline long long _itoll(unsigned int src2, unsigned int src1)
+{ /* Combines two 32-bit words into 64 bits: src2 is the upper word, src1 the lower word. */
+  return (((long long) src2) << 32) | src1;
+}
+
+
+static inline C6X_COMPAT_LONG40 _labs(C6X_COMPAT_LONG40 src2)
+{ /* Absolute value of src2, clamped to the range [-(2^39), 2^39 - 1]. */
+  long long maxv = (1LL << (40 -1)) - 1;
+  long long minv = (-1LL << (40 - 1));
+  C6X_COMPAT_LONG40 lres =  orig_L40_set(src2); /* orig_L40_set is defined outside this view; presumably it normalizes to a sign-extended 40-bit value - TODO confirm */
+
+  lres = lres < 0 ? -lres : lres;
+  if (lres > maxv) lres = maxv;
+  else if (lres < minv) lres = minv; /* lres is non-negative after the line above, so this branch is not expected to trigger */
+  
+  return lres;
+}
+
+
+static inline C6X_COMPAT_LONG40 _ldotp2(int src1, int src2) { /* Returns _dotp2(src1, src2) converted to C6X_COMPAT_LONG40. */
+return (C6X_COMPAT_LONG40) _dotp2(src1, src2);
+}
+
+
+static inline unsigned int _lmbd(unsigned int src1, unsigned int src2)
+{ /* Forwards a conditionally inverted src2 to norm_shift_amt_U_and_non_U (defined outside this view) with first argument 0. */
+  return norm_shift_amt_U_and_non_U(0,(((int) (src1 << 31)) >> 31) ^ (~src2)); /* bit 0 of src1 is replicated across the word (assuming an arithmetic right shift), so the helper sees src2 when that bit is set and ~src2 otherwise */
+}
+
+
+static inline unsigned int _lnorm(C6X_COMPAT_LONG40 src2) { /* Counts how many left shifts are needed before bits 39 and 38 of the 40-bit value differ; returns 39 for zero. */
+if (src2 == 0)
+    return 39;
+  else {
+    int hi = (int)(src2 >> 32);
+    int lo = (int)src2;
+    
+
+    long long temp = (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32; /* reassembles the value from its two 32-bit words */
+    temp = orig_L40_set(temp); /* helper defined outside this view; presumably sign-extends from 40 bits - TODO confirm */
+
+    if (temp == 0) return 0;
+    int cnt = 0;
+    while (((temp >> 39) & 1) == ((temp >> 38) & 1)) { /* bit 39 is the sign position of the 40-bit value */
+       temp <<= 1;
+       cnt++;
+       }
+  return cnt;
+  }
+}
+
+
+static inline unsigned long long _lo(double src) { /* Returns the lower 32 bits of the 64-bit object representation of src (counterpart of _hi). */
+	unsigned long long v;
+	*(double*)&v = src; /* copies the bit pattern of src into v through a pointer cast */
+	return v & 0xffffffffULL; /* drop the upper word so only the low word is returned */
+}
+
+
+static inline unsigned int _loll (long long src)
+{ /* Returns bits 31..0 of src. */
+  return (unsigned int) src;
+}
+
+
+static inline C6X_COMPAT_LONG40 _lsadd(int src1, C6X_COMPAT_LONG40 src2)
+{ /* Adds src1 to the 40-bit value src2, clamping to [-(2^39), 2^39 - 1] and setting _overflow when the sum is clamped. */
+  long long maxv = (1LL << (40 -1)) - 1;
+  long long minv = (-1LL << (40 - 1));
+  int hi = (int)(src2 >> 32);
+  int lo = (int)src2;
+  long long src2_int =  (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32; /* reassembles src2 from its two 32-bit words */
+
+
+  long long src2_int2 =  orig_L40_set(src2_int); /* helper defined outside this view; presumably sign-extends from 40 bits - TODO confirm */
+  
+  long long res = src1 + src2_int2;
+
+  if (res > maxv) { 
+	res = maxv;
+	_overflow = 1;
+	}
+  else if (res < minv) {
+	res = minv;
+	_overflow = 1;
+	}
+
+  long long res2 = orig_L40_set(res);
+
+  res2 = (signed char)(res2 >> 32); /* keeps bits 39..32 of the result as a sign-extended byte */
+  
+  C6X_COMPAT_LONG40 lres = (((C6X_COMPAT_LONG40) res2) << 32) | ((unsigned int)res); /* upper byte above the low 32 bits of the clamped sum */
+  return lres;
+}
+
+
+
+static inline int _max2 (int src1, int src2) { /* Lane-wise maximum of the two signed 16-bit lanes of src1 and src2. */
+	short s1[2], s2[2], r[2];
+	int result;
+	*((int*)s1) = src1; /* views the 32-bit word as two shorts through an int* alias */
+	*((int*)s2) = src2;
+	r[0] = s1[0] > s2[0] ? s1[0] : s2[0];
+	r[1] = s1[1] > s2[1] ? s1[1] : s2[1];
+	result = *(int*)r; /* lanes are written back in the same order they were read, so the result does not depend on byte order */
+	return result;
+}
+
+
+
+
+
+
+static inline unsigned int _maxu4(unsigned int src1, unsigned int src2) { /* Byte-wise maximum of the four unsigned bytes of src1 and src2. */
+  unsigned int res0, res1, res2, res3;
+  unsigned int s1_0 = res0 = (src1 & 0xFF); /* each result byte starts as the src1 byte */
+  unsigned int s1_1 = res1 = (src1 & 0xFF00) >> 8;
+  unsigned int s1_2 = res2 = (src1 & 0xFF0000) >> 16;
+  unsigned int s1_3 = res3 = (src1 & 0xFF000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xFF);
+  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
+  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
+
+//  unsigned int res = 0;
+
+  if (s1_0 < s2_0) /* replace with the src2 byte when it is larger */
+    res0 = s2_0;
+
+  if (s1_1 < s2_1)
+    res1 = s2_1;
+
+  if (s1_2 < s2_2)
+    res2 = s2_2;
+
+  if (s1_3 < s2_3)
+    res3 = s2_3;
+
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+
+
+}
+
+static inline int _min2(int src1, int src2) { /* Lane-wise minimum of the two signed 16-bit lanes of src1 and src2. */
+	short s1[2], s2[2], r[2];
+	int result;
+	*((int*)s1) = src1; /* views the 32-bit word as two shorts through an int* alias */
+	*((int*)s2) = src2;
+	r[0] = s1[0] < s2[0] ? s1[0] : s2[0];
+	r[1] = s1[1] < s2[1] ? s1[1] : s2[1];
+	result = *(int*)r; /* lanes are written back in the same order they were read, so the result does not depend on byte order */
+	return result;
+}
+
+
+static inline unsigned int _minu4(unsigned int src1, unsigned int src2) { /* Byte-wise minimum of the four unsigned bytes of src1 and src2. */
+unsigned int res0, res1, res2, res3;
+  unsigned int s1_0 = res0 = (src1 & 0xFF); /* each result byte starts as the src1 byte */
+  unsigned int s1_1 = res1 = (src1 & 0xFF00) >> 8;
+  unsigned int s1_2 = res2 = (src1 & 0xFF0000) >> 16;
+  unsigned int s1_3 = res3 = (src1 & 0xFF000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xFF);
+  unsigned int s2_1 = (src2 & 0xFF00) >> 8;
+  unsigned int s2_2 = (src2 & 0xFF0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xFF000000) >> 24;
+
+//  unsigned int res = 0;
+
+  if (s1_0 > s2_0) /* replace with the src2 byte when it is smaller */
+    res0 = s2_0;
+
+  if (s1_1 > s2_1)
+    res1 = s2_1;
+
+  if (s1_2 > s2_2)
+    res2 = s2_2;
+
+  if (s1_3 > s2_3)
+    res3 = s2_3;
+
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+}
+
+
+static inline int _mpy(int src1, int src2) { /* Signed product of the low 16 bits of src1 and the low 16 bits of src2. */
+return (short) src1 * (short) src2;
+}
+
+
+static inline int _mpyh(int src1, int src2) { /* Signed product of the high 16 bits of src1 and the high 16 bits of src2. */
+return (short) (src1 >> 16) * (short) (src2 >> 16);
+}
+
+
+static inline long long _mpyhill (int src1,  int src2)
+{ /* 64-bit product of the signed high 16 bits of src1 and the full 32-bit signed src2. */
+  short s1 = src1 >> 16;
+  return ((long long) src2) * s1;
+}
+
+static inline int _mpyhir(int src1, int src2)
+{ /* Returns (src2 * high16(src1) + 2^14) >> 15, computed in 64 bits and then converted to int. */
+  short s1 = src1 >> 16;
+  long long result = ((long long) src2) * s1 + (1 << 14); /* 1 << 14 is the rounding term for the shift by 15 */
+  result >>= 15;
+  return result;
+}
+
+
+static inline int _mpyhl(int src1, int src2) { /* Signed product of the high 16 bits of src1 and the low 16 bits of src2. */
+return (short) (src1 >> 16) * (short) (src2);
+}
+
+static inline unsigned int _mpyhlu(unsigned int src1, unsigned int src2) { /* Unsigned product of the high 16 bits of src1 and the low 16 bits of src2. */
+return (unsigned int) (unsigned short) (src1 >> 16) * (unsigned short) (src2); /* widen to unsigned first: two unsigned shorts promote to int, and 0xffff * 0xffff would overflow it */
+}
+
+static inline int _mpyhslu(int src1, unsigned int src2) { /* Product of the signed high 16 bits of src1 and the unsigned low 16 bits of src2. */
+return (short) (src1 >> 16) * (unsigned short) src2;
+}
+
+
+static inline int _mpyhsu(int src1, unsigned int src2) { /* Product of the signed high 16 bits of src1 and the unsigned high 16 bits of src2. */
+return (short) (src1 >>16) * (unsigned short) (src2 >>16);
+}
+
+
+static inline unsigned int _mpyhu(unsigned int src1, unsigned int src2) { /* Unsigned product of the high 16 bits of src1 and the high 16 bits of src2. */
+return (unsigned int) (unsigned short) (src1 >>16) * (unsigned short) (src2 >> 16); /* widen to unsigned first: two unsigned shorts promote to int, and 0xffff * 0xffff would overflow it */
+}
+
+
+static inline int _mpyhuls(unsigned int src1, int src2) { /* Product of the unsigned high 16 bits of src1 and the signed low 16 bits of src2. */
+return (unsigned short) (src1 >>16) * (signed short) (src2);
+}
+
+
+static inline int _mpyhus(unsigned int src1, int src2) { /* Product of the unsigned high 16 bits of src1 and the signed high 16 bits of src2. */
+return (unsigned short) (src1 >> 16) * (short) (src2 >>16);
+}
+
+
+
+static inline long long _mpyidll (int src1, int src2)
+{ /* Full 64-bit signed product of src1 and src2. */
+  return (long long) src1 * src2;
+}
+
+
+static inline int _mpylh(int src1, int src2) { /* Signed product of the low 16 bits of src1 and the high 16 bits of src2. */
+return (signed short) (src1 & 0xffff) * (signed short) (src2 >> 16);
+}
+
+static inline unsigned int _mpylhu(unsigned int src1, unsigned int src2) { /* Unsigned product of the low 16 bits of src1 and the high 16 bits of src2. */
+return (unsigned int) (unsigned short) src1 * (unsigned short) (src2 >> 16); /* widen to unsigned first: two unsigned shorts promote to int, and 0xffff * 0xffff would overflow it */
+}
+
+
+static inline long long _mpylill (int src1,  int src2)
+{ /* 64-bit product of the signed low 16 bits of src1 and the full 32-bit signed src2. */
+  return ((long long) src2) * ((short)src1);
+}
+
+
+
+static inline int _mpylir(int src1, int src2)
+{ /* Returns (src2 * low16(src1) + 2^14) >> 15, computed in 64 bits and then converted to int. */
+  short s1 = src1;
+  long long result = ((long long) src2) * s1 + (1 << 14); /* 1 << 14 is the rounding term for the shift by 15 */
+  result >>= 15;
+  return result;
+}
+
+
+static inline int _mpylshu(int src1, unsigned int src2) { /* Product of the signed low 16 bits of src1 and the unsigned high 16 bits of src2. */
+return (short) src1 * (unsigned short) (src2 >> 16);
+}
+
+
+static inline int _mpyluhs(unsigned int src1, int src2) { /* Product of the unsigned low 16 bits of src1 and the signed high 16 bits of src2. */
+return (unsigned short) src1 * (short) (src2 >> 16);
+}
+
+
+
+static inline int _mpysu(int src1, unsigned int src2) { /* Product of the signed low 16 bits of src1 and the unsigned low 16 bits of src2. */
+return (short) src1 * (unsigned short) src2;
+}
+
+
+
+static inline long long _mpysu4ll (int src1,  unsigned int src2) { /* Four byte-wise products (src1 bytes signed, src2 bytes unsigned), each kept to 16 bits and packed with byte 3 in the top lane. */
+  unsigned short res0, res1, res2, res3;
+  signed char s1_0 = (src1 & 0xff);
+  signed char s1_1 = (src1 & 0xff00) >> 8;
+  signed char s1_2 = (src1 & 0xff0000) >> 16;
+  signed char s1_3 = (src1 & 0xff000000) >> 24;
+
+  unsigned short s2_0 = (src2 & 0xff);
+  unsigned short s2_1 = (src2 & 0xff00) >> 8;
+  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
+
+  res0 = s1_0 * s2_0; /* the int product is truncated to 16 bits by the unsigned short assignment */
+  res1 = s1_1 * s2_1;
+  res2 = s1_2 * s2_2;
+  res3 = s1_3 * s2_3;
+
+  return (((unsigned long long) res3) << 48)
+    | (((unsigned long long) res2) << 32)
+    | (((unsigned long long) res1) << 16)
+    | res0;
+}
+
+static inline unsigned int _mpyu(unsigned int src1, unsigned int src2) { /* Unsigned product of the low 16 bits of src1 and the low 16 bits of src2. */
+	unsigned v = (unsigned int)(unsigned short)src1 * (unsigned short)src2; /* widen to unsigned first: two unsigned shorts promote to int, and 0xffff * 0xffff would overflow it */
+	return v;
+}
+
+static inline int _mpyus(unsigned int src1, int src2) { /* Product of the unsigned low 16 bits of src1 and the signed low 16 bits of src2. */
+return (unsigned short) src1 * (short) src2;
+}
+
+static inline long long _mpyu4ll (unsigned int src1,  unsigned int src2) { /* Four byte-wise unsigned products, each kept to 16 bits and packed with byte 3 in the top lane. */
+  unsigned short res0, res1, res2, res3;
+  unsigned char s1_0 = (src1 & 0xff);
+  unsigned char s1_1 = (src1 & 0xff00) >> 8;
+  unsigned char s1_2 = (src1 & 0xff0000) >> 16;
+  unsigned char s1_3 = (src1 & 0xff000000) >> 24;
+
+  unsigned short s2_0 = (src2 & 0xff);
+  unsigned short s2_1 = (src2 & 0xff00) >> 8;
+  unsigned short s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned short s2_3 = (src2 & 0xff000000) >> 24;
+
+  res0 = s1_0 * s2_0; /* at most 255 * 255, which fits in 16 bits */
+  res1 = s1_1 * s2_1;
+  res2 = s1_2 * s2_2;
+  res3 = s1_3 * s2_3;
+
+  return (((unsigned long long) res3) << 48)
+    | (((unsigned long long) res2) << 32)
+    | (((unsigned long long) res1) << 16)
+    | res0;
+}
+
+
+static inline long long _mpy2ir(unsigned int src1, unsigned int src2)
+{ /* Two rounded products of the signed 16-bit halves of src1 with signed src2, each (half * src2 + 2^14) >> 15; high-half result in bits 63..32, low-half result in bits 31..0. */
+  if ((src1 == 0x8000) && (src2 == 0x80000000)) { /* NOTE(review): only this exact src1 value is special-cased; confirm whether a 0x8000 high half should be handled too */
+    _overflow = 1;
+    return 0;
+  }
+  else {
+    short ls1 = src1 & 0xffff;
+    short hs1 = src1 >> 16;
+    unsigned long long hi = (((long long) hs1) * (int) src2 + (1 << 14)) >> 15;
+    unsigned long long lo = ((((long long) ls1) * (int) src2 + (1 << 14)) >> 15) & 0xFFFFFFFF; /* masked so sign bits do not spill into the upper word */
+    return (hi << 32) | lo;
+  }
+}
+
+
+static inline long long _mpy2ll (int src1,  int src2) { /* Two signed 16x16 products: high halves in bits 63..32, low halves in bits 31..0. */
+  short ls1 = src1 & 0xffff;
+  short hs1 = src1 >> 16;
+  short ls2 = src2 & 0xffff;
+  short hs2 = src2 >> 16;
+
+  unsigned long long hi = hs1 * hs2;
+  unsigned long long lo = (ls1 * ls2) & 0xFFFFFFFF; /* masked so sign bits do not spill into the upper word */
+
+  return (hi << 32) | lo;
+  
+}
+
+
+static inline int _mpy32(int src1, int src2)
+{ /* Returns the low 32 bits of src1 * src2. */
+  return (int) ((unsigned int) src1 * (unsigned int) src2); /* multiply as unsigned so wrap-around is well defined instead of signed-overflow UB */
+}
+
+
+static inline long long _mpy32ll(int src1, int src2)
+{ /* Full 64-bit signed product of src1 and src2. */
+  return ((long long) src1) * src2;
+}
+
+static inline long long _mpy32su(int src1, unsigned int src2)
+{ /* 64-bit product of signed src1 and unsigned src2. */
+  return ((long long) src1) * ((long long) src2); /* src2 is widened as unsigned; casting it to int would treat values >= 2^31 as negative */
+}
+
+static inline long long _mpy32u(unsigned int src1, unsigned int src2)
+{ /* 64-bit product of unsigned src1 and unsigned src2, returned as long long. */
+  return (long long) ((unsigned long long) src1 * src2); /* both operands are widened as unsigned; casting them to int would treat values >= 2^31 as negative */
+}
+
+static inline long long _mpy32us(unsigned int src1, int src2)
+{ /* 64-bit product of unsigned src1 and signed src2. */
+  return ((long long) src1) * ((long long) src2); /* src1 is widened as unsigned; casting it to int would treat values >= 2^31 as negative */
+}
+
+static inline int _mvd (int src2)
+{ /* Returns src2 unchanged. */
+  return src2;
+}
+
+
+static inline unsigned int _norm(int src2)
+{ /* Forwards src2 to norm_shift_amt_U_and_non_U (defined outside this view) with first argument 1. */
+  return norm_shift_amt_U_and_non_U(1,src2); /* NOTE(review): the meaning of the leading flag is not visible here - confirm against the helper */
+}
+
+
+static inline unsigned int _pack2 (unsigned int src1, unsigned int src2) { /* Builds a word whose lane 0 is lane 0 of src2 and whose lane 1 is lane 0 of src1 (16-bit lanes). */
+	short s1[2], s2[2], r[2];
+	int result;
+	*((int*)s1) = src1; /* NOTE(review): lanes are accessed through an int* alias; which half is lane 0 depends on the target byte order */
+	*((int*)s2) = src2;
+	r[0] = s2[0];
+	r[1] = s1[0];
+	result = *(int*)r;
+	return result;
+}
+
+
+static inline int _packh2 (unsigned int src1, unsigned int src2) { /* High 16 bits of src1 stay in place; high 16 bits of src2 become the low 16 bits. */
+	unsigned v0 = src1 & 0xffff0000;
+	unsigned v1 = src2 >> 16;
+	unsigned v = v0|v1;
+	return v;
+	
+}
+
+static inline unsigned int _packh4 (unsigned int src1, unsigned int src2) { /* Packs bytes 3 and 1 of src1 above bytes 3 and 1 of src2. */
+	unsigned v3 = (src1 >> 24) & 0xff;
+	unsigned v2 = (src1 >> 8) & 0xff;
+	unsigned v1 = (src2 >> 24) & 0xff;
+	unsigned v0 = (src2 >> 8) & 0xff;
+	unsigned v = (v3<<24) | (v2<<16) | (v1 << 8) | v0;
+	return v;
+}
+
+static inline unsigned int _packhl2 (unsigned int src1,  unsigned int src2) { /* High 16 bits from src1, low 16 bits from src2. */
+	unsigned v0 = src1 & 0xffff0000;
+	unsigned v1 = src2 & 0x0000ffff;
+	unsigned v = v0|v1;
+	return v;
+}
+
+static inline unsigned int _packlh2 (unsigned int src1,  unsigned int src2) { /* Low 16 bits of src1 become the high half; high 16 bits of src2 become the low half. */
+	unsigned v0 = src1 << 16;
+	unsigned v1 = (src2 >> 16) & 0xffff;
+	unsigned v = v0|v1;
+	return v;
+}
+
+
+
+
+static inline unsigned int _packl4 (unsigned int src1, unsigned int src2) { /* Packs bytes 2 and 0 of src1 above bytes 2 and 0 of src2. */
+	unsigned v3 = (src1 >> 16) & 0xff;
+	unsigned v2 = (src1) & 0xff;
+	unsigned v1 = (src2 >> 16) & 0xff;
+	unsigned v0 = (src2) & 0xff;
+	unsigned v = (v3<<24) | (v2<<16) | (v1 << 8) | v0;
+	return v;
+}
+
+
+
+
+static inline unsigned int _rpack2 (unsigned int src1, unsigned int src2) { /* Runs both inputs through util_shift_right_saturate_n(x, -1, 32), then packs the high 16 bits of each (src1 on top). */
+int s1 = (int) src1;
+int s2 = (int) src2;
+s1 = util_shift_right_saturate_n (s1, -1, 32); /* helper defined outside this view; a count of -1 presumably means a saturating left shift by one - TODO confirm */
+s2 = util_shift_right_saturate_n (s2, -1, 32);
+return (unsigned int) (s1 & 0xffff0000) | (unsigned int) ((s2 & 0xffff0000) >>16);
+}
+
+
+static inline unsigned int _rotl (unsigned int src1, unsigned int src2)
+{ /* Rotates src1 left by the low 5 bits of src2. */
+  src2 &= 0x1f;
+  return (src1 << src2) | (src1 >> ((32 - src2) & 0x1f)); /* mask the right-shift count so a rotate by 0 does not shift by 32, which is undefined */
+}
+
+
+static inline int _sadd(int src1, int src2) { /* Adds src1 and src2 in 64 bits, clamps the sum to the 32-bit signed range and sets _overflow when it is clamped. */
+signed long long res;
+signed long long maxv, minv;
+maxv = (1LL << (32-1)) - 1;
+minv = (-1LL << (32-1));
+res = (long long) src1 + (long long) src2;
+if (res > maxv) {
+	res = maxv;
+	_overflow = 1;
+	}
+else if (res < minv ) {
+	res = minv;
+	_overflow = 1;
+	}
+return (int) res;
+}
+
+static inline long long _saddsub(unsigned int src1, unsigned int src2) { /* Returns src1 + src2 in bits 63..32 and the clamped difference src1 - src2 in bits 31..0; _overflow is never written here. */
+int radd;
+signed long long rsub;
+
+signed long long maxv, minv;
+maxv = (1LL << (32-1)) - 1;
+minv = (-1LL << (32-1));
+
+radd = (int) src1 + (int) src2; /* plain int addition: the sum is not clamped */
+
+//   saturate on subtract, not add
+
+
+rsub = (long long) ((int) src1) - (long long) ((int) src2); /* difference is formed in 64 bits so it can be range-checked below */
+if (rsub > maxv) {
+	rsub = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (rsub < minv ) {
+	rsub = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+return  (((unsigned long long) radd) << 32) |  ( rsub & 0x00000000ffffffff ) ;
+}
+
+
+
+static inline long long _saddsub2(unsigned int src1, unsigned int src2) { /* Lane-wise 16-bit add and subtract, each clamped to [-32768, 32767]; returns radd[1], radd[0], rsub[1], rsub[0] from most to least significant lane. */
+signed int radd[2];
+signed int rsub[2];
+signed short s1[2], s2[2];
+
+signed int maxv, minv;
+maxv = (1L << (16-1)) - 1;
+minv = (-1L << (16-1));
+
+*((int*)s1) = src1; /* NOTE(review): lanes are read through an int* alias; which half is lane 0 depends on the target byte order */
+*((int*)s2) = src2;
+
+radd[0] =  (int) s1[0] + (int) s2[0];
+radd[1] =  (int) s1[1] + (int) s2[1];
+
+rsub[0] =  (int) s1[0] - (int) s2[0];
+rsub[1] =  (int) s1[1] - (int) s2[1];
+
+if (radd[0] > maxv) {
+	radd[0] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (radd[0] < minv ) {
+	radd[0] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+if (radd[1] > maxv) {
+	radd[1] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (radd[1] < minv ) {
+	radd[1] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+
+if (rsub[0] > maxv) {
+	rsub[0] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (rsub[0] < minv ) {
+	rsub[0] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+if (rsub[1] > maxv) {
+	rsub[1] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (rsub[1] < minv ) {
+	rsub[1] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+
+return  ((((unsigned long long) radd[1]) & 0x000000000000ffff) << 48) | 
+        ((((unsigned long long) radd[0]) & 0x000000000000ffff) << 32) | 
+        ((((unsigned long long) rsub[1]) & 0x000000000000ffff) << 16) |
+        ((((unsigned long long) rsub[0]) & 0x000000000000ffff));
+}
+
+
+
+static inline  int _sadd2(int src1, int src2) { /* Lane-wise 16-bit signed add with each lane clamped to [-32768, 32767]; _overflow is not written. */
+signed short s1[2], s2[2];
+signed int r[2], maxv, minv;
+
+maxv = (1L << (16-1)) - 1;
+minv = (-1L << (16-1));
+
+
+*((int*)s1) = src1; /* NOTE(review): lanes are read through an int* alias but the result is assembled with shifts, so lane order assumes index 1 is the high half */
+*((int*)s2) = src2;
+
+r[0] =  (int) s1[0] + (int) s2[0];
+r[1] =  (int) s1[1] + (int) s2[1];
+
+if (r[0] > maxv) {
+	r[0] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (r[0] < minv ) {
+	r[0] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+if (r[1] > maxv) {
+	r[1] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (r[1] < minv ) {
+	r[1] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+return ((r[1] & 0xffff) << 16 ) | (r[0] & 0xffff) ;
+}
+
+
+static inline int _saddus2(unsigned int src1, int src2) { /* Lane-wise add of unsigned 16-bit halves of src1 and signed 16-bit halves of src2, each result clamped to [0, 0xffff]. */
+int res0, res1;
+  unsigned int s1_0 = (src1 & 0xffff);
+  unsigned int s1_1 = (src1 & 0xffff0000) >> 16;
+
+  short s2_0 = (src2 & 0xffff);
+  short s2_1 = (src2 & 0xffff0000) >> 16;
+
+  res0 = s1_0 + s2_0; /* evaluated as unsigned and converted to int, so a negative sum shows up as a negative res0 */
+  res1 = s1_1 + s2_1;
+
+  if (res0 >= 0x10000)
+    res0 = 0xffff;
+  else if (res0 < 0)
+    res0 = 0;
+  
+  if (res1 >= 0x10000)
+    res1 = 0xffff;
+  else if (res1 < 0)
+    res1 = 0;
+  
+  return (res1 << 16) | res0;
+}
+
+
+static inline unsigned int _saddu4(unsigned int src1, unsigned int src2) { /* Byte-wise unsigned add of src1 and src2 with each byte clamped to 0xff. */
+unsigned int res0, res1, res2, res3;
+  unsigned int s1_0 = (src1 & 0xff);
+  unsigned int s1_1 = (src1 & 0xff00) >> 8;
+  unsigned int s1_2 = (src1 & 0xff0000) >> 16;
+  unsigned int s1_3 = (src1 & 0xff000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xff);
+  unsigned int s2_1 = (src2 & 0xff00) >> 8;
+  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
+
+  res0 = s1_0 + s2_0;
+  res1 = s1_1 + s2_1;
+  res2 = s1_2 + s2_2;
+  res3 = s1_3 + s2_3;
+
+  if (res0 >= 0x100) /* a carry out of the byte means the lane saturates */
+    res0 = 0xff;
+  
+  if (res1 >= 0x100)
+    res1 = 0xff;
+  
+  if (res2 >= 0x100)
+    res2 = 0xff;
+  
+  if (res3 >= 0x100)
+    res3 = 0xff;
+
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+
+}
+
+
+
+static inline int _sat(C6X_COMPAT_LONG40 src2)
+{ /* Clamps the 40-bit value src2 to the 32-bit signed range and sets _overflow when it is clamped. */
+  long long maxv = (1LL << (32-1)) - 1;
+  long long minv = (-1LL << (32-1));
+
+  int hi = (int)(src2 >> 32);
+  int lo = (int)src2;
+  long long temp = (unsigned long long)(unsigned)lo | (unsigned long long)hi << 32; /* reassembles src2 from its two 32-bit words */
+  temp = orig_L40_set(temp); /* helper defined outside this view; presumably sign-extends from 40 bits - TODO confirm */
+  
+  if (temp > maxv) {
+	temp = maxv;
+	_overflow = 1;
+	}
+  else if (temp < minv) {
+	temp = minv;
+	_overflow = 1;
+	}
+  return (int) temp;
+}
+
+static inline unsigned int _set(unsigned int src2, unsigned int csta, unsigned int cstb)
+{ /* Returns src2 with bits csta..cstb (inclusive, each taken modulo 32) set to 1; src2 is returned unchanged when csta > cstb. */
+  csta &= 0x1f;
+  cstb &= 0x1f;
+  if (csta > cstb)
+    return src2;
+  else {
+    unsigned int mask =  (((1u <<  (cstb -  csta)) <<  1) -  1) <<  csta; /* unsigned 1u keeps the 32-bit-wide field (csta 0, cstb 31) well defined: the shift wraps to 0 and 0 - 1 gives all ones */
+    return src2 | mask;
+  }
+}
+
+static inline unsigned int _setr(unsigned int src2, int src1)
+{ /* Returns src2 with bits csta..cstb set to 1, where csta is bits 9..5 of src1 and cstb is bits 4..0; src2 is returned unchanged when csta > cstb. */
+  unsigned int csta = (src1 >> 5) & 0x1f;
+  unsigned int cstb = src1 & 0x1f;
+  if (csta > cstb)
+    return src2;
+  else {
+    unsigned int mask =  (((1u <<  (cstb -  csta)) <<  1) -  1) <<  csta; /* unsigned 1u keeps the 32-bit-wide field (csta 0, cstb 31) well defined: the shift wraps to 0 and 0 - 1 gives all ones */
+    return src2 | mask;
+  }
+}
+
+
+static inline unsigned int _shfl (unsigned int src2)
+{ /* Bit-interleaves the two halves of src2: bit i of the low half goes to bit 2i, bit i of the high half to bit 2i + 1. */
+  unsigned short lo = src2;
+  unsigned short hi = src2 >> 16;
+  unsigned int result = 0;
+  int i;
+  for (i = 0; i < 32; i+= 2) { /* 16 iterations, two output bits per iteration */
+    result >>= 1;
+    result |= (lo & 0x1) << 31; /* new bits enter at the top and are shifted down by later iterations */
+    lo >>= 1;
+    result >>= 1;
+    result |= (hi & 0x1) << 31;
+    hi >>= 1;
+  }
+  return result;
+}
+
+static inline long long _shfl3 (unsigned int src1, unsigned int src2)
+{ /* Three-way bit interleave into 48 bits: bit i of low16(src2) goes to bit 3i, of low16(src1) to bit 3i + 1, of high16(src1) to bit 3i + 2. */
+  unsigned short lo = src2;
+  unsigned short hi = src1 >> 16;
+  unsigned short mid = src1;
+  unsigned long long result = 0;
+  int i;
+  for (i = 0; i < 32; i+= 2) { /* 16 iterations, three output bits per iteration */
+    result >>= 1;
+    result |= ((unsigned long long) (lo & 0x1)) << 47; /* new bits enter at bit 47 and are shifted down by later iterations */
+    lo >>= 1;
+    result >>= 1;
+    result |= ((unsigned long long) (mid & 0x1)) << 47;
+    mid >>= 1;
+    result >>= 1;
+    result |= ((unsigned long long) (hi & 0x1)) << 47;
+    hi >>= 1;
+  }
+  return result;
+}
+
+
+
+static inline unsigned int _shlmb (unsigned int src1, unsigned int src2)
+{ /* Shifts src2 left by one byte and fills the low byte with the top byte of src1. */
+  return (src2 << 8) | (src1 >> 24);
+}
+
+static inline unsigned int _shrmb (unsigned int src1, unsigned int src2)
+{ /* Shifts src2 right by one byte and fills the top byte with the low byte of src1. */
+  return (src2 >> 8) | (src1 << 24);
+}
+
+
+static inline unsigned int _shru2 (unsigned int src1, unsigned int src2) { /* Logical right shift of each unsigned 16-bit half of src1 by src2 bits. */
+unsigned short hs1 = src1 >> 16;
+  unsigned short ls1 = src1 & 0xFFFF;
+  hs1 >>= src2; /* the count is used as given, without masking */
+  ls1 >>= src2;
+  return (hs1 << 16) | ls1;
+}
+
+
+static inline int _shr2 (int src1, unsigned int src2) { /* Right shift of each signed 16-bit lane of src1 by the low 5 bits of src2. */
+  short s1[2], result[2];
+  *((int*)s1) = src1; /* views the 32-bit word as two shorts through an int* alias */
+  src2 = src2 & 31;
+  result[0] = (int)s1[0] >> src2; /* each lane is widened to int, shifted, and truncated back to short */
+  result[1] = (int)s1[1] >> src2;
+
+  return *(int*)result;
+}
+
+
+static inline int _smpy (int src1, int src2) { /* Doubled signed product of the low 16 bits of src1 and src2; a result of 0x80000000 is replaced by 0x7fffffff with _overflow set. */
+unsigned long long result;
+result =  (((short) src1 * (short) src2) << 1); /* NOTE(review): the shift is done on int, so the 0x4000 * 2 * 0x4000 * 2 style corner (both inputs -32768) overflows int before the check below */
+
+if ((result & 0xffffffff) == 0x80000000){
+    result = 0x7fffffff;
+    _overflow = 1;
+  }
+return (int) (result);
+}
+
+static inline int _smpyh (int src1, int src2) { /* Doubled signed product of the high 16 bits of src1 and src2; a result of 0x80000000 is replaced by 0x7fffffff with _overflow set. */
+unsigned long long result;
+result =  ((short) (src1 >> 16) * (short) (src2 >> 16)) << 1; /* NOTE(review): the shift is done on int and overflows it when both halves are -32768 */
+if ((result & 0xffffffff) == 0x80000000){
+    result = 0x7fffffff;
+    _overflow = 1;
+  }
+return (int) (result);
+}
+
+static inline int _smpyhl (int src1, int src2) { /* Doubled signed product of the high 16 bits of src1 and the low 16 bits of src2; a result of 0x80000000 is replaced by 0x7fffffff with _overflow set. */
+unsigned long long result;
+result = ((short) (src1 >> 16) * (short) (src2)) << 1; /* NOTE(review): the shift is done on int and overflows it when both halves are -32768 */
+if ((result & 0xffffffff) == 0x80000000){
+    result = 0x7fffffff;
+    _overflow = 1;
+  }
+return (int) (result);
+}
+
+static inline int _smpylh (int src1, int src2) { /* Doubled signed product of the low 16 bits of src1 and the high 16 bits of src2; a result of 0x80000000 is replaced by 0x7fffffff with _overflow set. */
+unsigned long long result;
+result = ((short) (src1) * (short) (src2 >> 16)) << 1; /* NOTE(review): the shift is done on int and overflows it when both halves are -32768 */
+if ((result & 0xffffffff) == 0x80000000){
+    result = 0x7fffffff;
+    _overflow = 1;
+  }
+return (int) (result);
+}
+
+static inline long long _smpy2ll (int src1,  int src2) { /* Two doubled signed 16x16 products (high halves in bits 63..32, low halves in bits 31..0); a lane equal to 0x80000000 becomes 0x7fffffff and sets _overflow. */
+  short ls1 = src1 & 0XFFFF;
+  short hs1 = src1 >> 16;
+  short ls2 = src2 & 0XFFFF;
+  short hs2 = src2 >> 16;
+
+  unsigned long long hi = (hs1 * hs2) << 1; /* NOTE(review): the shift is done on int and overflows it when both halves are -32768 */
+  unsigned long long lo = ((ls1 * ls2) << 1) & 0xFFFFFFFF;
+  if ((hi & 0xffffffff) == 0x80000000){
+    hi = 0x7fffffff;
+    _overflow = 1;
+  }
+
+  if ((lo & 0xffffffff) == 0x80000000){
+    lo = 0x7fffffff;
+    _overflow = 1;
+  }
+
+  return (hi << 32) | lo;
+}
+
+
+
+
+static inline int _smpy32(int src1, int src2)
+{ /* Returns bits 63..32 of the doubled 64-bit signed product of src1 and src2. */
+  long long res = (long long) src1 * src2;
+  res <<= 1; /* NOTE(review): no clamp is applied; with both inputs equal to INT_MIN the product is 2^62 and this shift overflows long long */
+  res >>= 32;
+  return res;
+}
+
+static inline unsigned char TEN_satu8 (short src)
+{ /* Clamps src to the unsigned 8-bit range [0, 0xff]. */
+  if (src > 0xff)
+    return 0xff;
+  else if (src < 0)
+    return 0;
+  else
+    return src;
+}
+
+static inline int _spack2 (int src1, int src2) { /* Reduces src1 and src2 to 16 bits via util_saturate_n_no_state(x, 16) and packs them, src1 in the high half and src2 in the low half. */
+short s1 = (short) util_saturate_n_no_state(src1,16); /* helper defined outside this view; presumably clamps to the signed 16-bit range without touching _overflow - TODO confirm */
+short s2 = (short) util_saturate_n_no_state(src2,16);
+return  ( (unsigned int) s1 << 16) | (((int) s2) & 0xFFFF);
+}
+
+
+static inline unsigned int _spacku4 (int src1, int src2) { /* Clamps each signed 16-bit half of src1 and src2 to [0, 0xff] and packs the four bytes: src1 high, src1 low, src2 high, src2 low. */
+  short lolo = src2;
+  short lohi = src2 >> 16;
+  short hilo = src1;
+  short hihi = src1 >> 16;
+
+  lolo = TEN_satu8(lolo);
+  lohi = TEN_satu8(lohi);
+  hilo = TEN_satu8(hilo);
+  hihi = TEN_satu8(hihi);
+
+  return (((unsigned int) hihi) <<  24) | (((unsigned int) hilo) << 16) | (lohi << 8) | lolo;
+}
+
+
+
+static inline int _sshl (int src1, unsigned int src2) { /* Passes src1 to util_shift_right_saturate_n with the negated low 15 bits of src2 as the count and a width of 32. */
+short local2 = (short)(src2 & 0x7FFF);
+return (int) util_shift_right_saturate_n(src1, -local2, 32); /* helper defined outside this view; a negative count presumably means a saturating left shift - TODO confirm */
+}
+
+
+
+
+static inline int _sshvl (int src2, int src1) { /* Clamps src1 to [-31, 31] and passes src2 to util_shift_right_saturate_n with the negated count and a width of 32. */
+  short s1;
+  if (src1 > 31)
+    s1 = 31;
+  else if (src1 < -31)
+    s1 = -31;
+  else
+    s1 = src1;
+
+  return (int) util_shift_right_saturate_n(src2, -s1, 32); /* helper defined outside this view; a negative count presumably shifts left - TODO confirm */
+}
+
+
+
+
+
+static inline int _sshvr (int src2, int src1) { /* Clamps src1 to [-31, 31] and passes src2 to util_shift_right_saturate_n with that count and a width of 32. */
+short s1;
+  if (src1 > 31)
+    s1 = 31;
+  else if (src1 < -31)
+    s1 = -31;
+  else
+    s1 = src1;
+  return (int) util_shift_right_saturate_n(src2, s1, 32); /* helper defined outside this view */
+}
+
+
+
+
+static inline int _ssub(int src1, int src2) { /* Subtracts src2 from src1 in 64 bits, clamps the result to the 32-bit signed range and sets _overflow when it is clamped. */
+signed long long res;
+signed long long maxv, minv;
+maxv = (1LL << (32-1)) - 1;
+minv = (-1LL << (32-1));
+res = (long long) src1 - (long long) src2;
+if (res > maxv) {
+	res = maxv;
+	_overflow = 1;
+	}
+else if (res < minv ) {
+	res = minv;
+	_overflow = 1;
+	}
+return (int) res;
+}
+
+static inline int _ssub2(int src1, int src2) { /* Lane-wise 16-bit signed subtract (src1 - src2) with each lane clamped to [-32768, 32767]; _overflow is not written. */
+signed short s1[2], s2[2];
+signed int r[2], maxv, minv;
+
+maxv = (1L << (16-1)) - 1;
+minv = (-1L << (16-1));
+
+
+*((int*)s1) = src1; /* NOTE(review): lanes are read through an int* alias but the result is assembled with shifts, so lane order assumes index 1 is the high half */
+*((int*)s2) = src2;
+
+r[0] =  (int) s1[0] - (int) s2[0];
+r[1] =  (int) s1[1] - (int) s2[1];
+
+if (r[0] > maxv) {
+	r[0] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (r[0] < minv ) {
+	r[0] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+if (r[1] > maxv) {
+	r[1] = maxv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+else if (r[1] < minv ) {
+	r[1] = minv;
+        /* NOTE:   TI c6x does NOT set the overflow register even if results saturate */
+	/*  _overflow = 1; */
+	}
+
+return ((r[1] & 0xffff) << 16 ) | (r[0] & 0xffff) ;
+}
+
+
+static inline int _subabs4 (int src1, int src2) { /* Byte-wise absolute difference of the four unsigned bytes of src1 and src2. */
+  int res0, res1, res2, res3;
+  unsigned int s1_0 = (src1 & 0xff);
+  unsigned int s1_1 = (src1 & 0xff00) >> 8;
+  unsigned int s1_2 = (src1 & 0xff0000) >> 16;
+  unsigned int s1_3 = (src1 & 0xff000000) >> 24;
+
+  unsigned int s2_0 = (src2 & 0xff);
+  unsigned int s2_1 = (src2 & 0xff00) >> 8;
+  unsigned int s2_2 = (src2 & 0xff0000) >> 16;
+  unsigned int s2_3 = (src2 & 0xff000000) >> 24;
+
+  res0 = s1_0 - s2_0; /* unsigned difference converted to int, so a smaller s1 byte yields a negative value */
+  res1 = s1_1 - s2_1;
+  res2 = s1_2 - s2_2;
+  res3 = s1_3 - s2_3;
+
+  if (res0 < 0)
+    res0 = -res0;
+  
+  if (res1 < 0)
+    res1 = -res1;
+  
+  if (res2 < 0)
+    res2 = -res2;
+  
+  if (res3 < 0)
+    res3 = -res3;
+
+  return (res3 << 24) | (res2 << 16) | (res1 << 8) | res0;
+}
+
+
+static inline unsigned int _subc (unsigned int src1, unsigned int src2)
+{ /* Conditional subtract step: if src1 >= src2 returns ((src1 - src2) << 1) + 1, otherwise src1 << 1. */
+  if ( src1 >=  src2)
+    return ((src1 - src2) <<  1) +  1;
+  else
+    return src1 << 1;
+}
+
+
+
+static inline int _sub2(int src1, int src2) { /* Lane-wise difference (src1 - src2) of the two 16-bit lanes, each truncated to 16 bits with no saturation. */
+	short s1[2], s2[2], r[2];
+	int result;
+	*((int*)s1) = src1; /* views the 32-bit word as two shorts through an int* alias */
+	*((int*)s2) = src2;
+	r[0] = s1[0] - s2[0];
+	r[1] = s1[1] - s2[1];
+	result = *(int*)r; /* lanes are written back in the same order they were read, so the result does not depend on byte order */
+	return result;
+}
+
+
+static inline int _sub4(int src1, int src2) { /* Lane-wise difference (src1 - src2) of the four byte lanes, each truncated to 8 bits with no saturation. */
+	char c1[4], c2[4], r[4];
+	int result;
+	*((int*)c1) = src1; /* views the 32-bit word as four chars through an int* alias */
+	*((int*)c2) = src2;
+	r[0] = c1[0] - c2[0];
+	r[1] = c1[1] - c2[1];
+	r[2] = c1[2] - c2[2];
+	r[3] = c1[3] - c2[3];
+	result = *(int*)r; /* lanes are written back in the same order they were read, so the result does not depend on byte order */
+	return result;
+}
+
+
+static inline int _swap4 (unsigned int src1) { /* Swaps the two bytes inside each 16-bit half of src1 (byte order 3,2,1,0 becomes 2,3,0,1). */
+	unsigned char v0 = src1;
+	unsigned char v1 = src1 >> 8;
+	unsigned char v2 = src1 >> 16;
+	unsigned char v3 = src1 >> 24;
+	unsigned v = (unsigned)v0<<8 | v1 | (unsigned)v2<<24 | (unsigned)v3<<16; /* shift as unsigned: a promoted int would overflow when v2 >= 0x80 is moved into the top byte */
+	return v;
+}
+
+static inline unsigned int _unpkhu4 (unsigned int src1) { /* Zero-extends bytes 3 and 2 of src1 into the high and low 16-bit halves of the result. */
+	unsigned v0 = src1>>24;
+	unsigned v1 = (src1>>16) & 0xff;
+	return (v0<<16) | v1;
+}
+
+static inline unsigned int _unpklu4 (unsigned int src1) { /* Zero-extends bytes 1 and 0 of src1 into the high and low 16-bit halves of the result. */
+	unsigned v1 = (src1>>8) & 0xff;
+	unsigned v0 = (src1) & 0xff;
+	return (v1<<16) | v0;
+}
+
+
+
+
+static inline unsigned int _xpnd2 (unsigned int src1) { /* Expands bits 0 and 1 of src1 into 16-bit masks: bit 0 fills the low half, bit 1 the high half. */
+      int v0 = (src1 & 0x1) ? 0x0000ffff : 0x00000000;
+      int v1 = (src1 & 0x2) ? 0xffff0000 : 0x00000000;
+      return v0|v1;
+}
+
+static inline unsigned int _xpnd4 (unsigned int src1) { /* Expands bits 0..3 of src1 into byte masks: bit n fills byte n of the result with 0xff. */
+      int v0 = (src1 & 0x1) ? 0x000000ff : 0x00000000;
+      int v1 = (src1 & 0x2) ? 0x0000ff00 : 0x00000000;
+      int v2 = (src1 & 0x4) ? 0x00ff0000 : 0x00000000;
+      int v3 = (src1 & 0x8) ? 0xff000000 : 0x00000000;
+      int r = v0|v1|v2|v3;
+      return r;
+}
+
+
+
+//     end of Implemented in alphabetical order
+
+
+#endif /* __C6X_COMPAT__H */