bwaPPC64.patch (forked from lindenb/jbwa)
diff -Naur /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fb/ksw.c /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fbPPC/ksw.c
--- /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fb/ksw.c 2016-02-01 11:52:54.000000000 -0500
+++ /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fbPPC/ksw.c 2016-06-09 13:35:57.113887000 -0400
@@ -26,7 +26,11 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
+#ifdef __PPC64__
+#include "vec128int.h"
+#else
#include <emmintrin.h>
+#endif
#include "ksw.h"
#ifdef USE_MALLOC_WRAPPERS
@@ -115,38 +117,111 @@
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
+#ifdef __PPC64__
+#define __max_16(ret, xx) do { \
+ (xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 8)); \
+ (xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 4)); \
+ (xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 2)); \
+ (xx) = vec_max16ub((xx), vec_shiftrightbytes1q((xx), 1)); \
+ (ret) = vec_extract8sh((xx), 0) & 0x00ff; \
+ } while (0)
+#else
#define __max_16(ret, xx) do { \
- (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
- (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
- (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
- (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
- (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
- } while (0)
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
+ (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
+ } while (0)
+#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
- zero = _mm_set1_epi32(0);
+#ifdef __PPC64__
+ zero = vec_splat4sw(0);
+#else
+ zero = _mm_set1_epi32(0); /* !!!REP NOT FOUND!!! */
+#endif
+#ifdef __PPC64__
+ oe_del = vec_splat16sb(_o_del + _e_del);
+#else
oe_del = _mm_set1_epi8(_o_del + _e_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e_del = vec_splat16sb(_e_del);
+#else
e_del = _mm_set1_epi8(_e_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ oe_ins = vec_splat16sb(_o_ins + _e_ins);
+#else
oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e_ins = vec_splat16sb(_e_ins);
+#else
e_ins = _mm_set1_epi8(_e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ shift = vec_splat16sb(q->shift);
+#else
shift = _mm_set1_epi8(q->shift);
+ /* NEED INSPECTION */
+#endif
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
+#ifdef __PPC64__
+ vec_store1q(E + i, zero);
+#else
_mm_store_si128(E + i, zero);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H0 + i, zero);
+#else
_mm_store_si128(H0 + i, zero);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(Hmax + i, zero);
+#else
_mm_store_si128(Hmax + i, zero);
+ /* NEED INSPECTION */
+#endif
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, cmp, imax;
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+#ifdef __PPC64__
+ h = vec_load1q(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+#else
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
- h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+ /* NEED INSPECTION */
+#endif
+ #ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+ h = vec_shiftrightbytes1q(h, 1);
+#else
+ h = _mm_srli_si128(h, 1);
+ /* NEED INSPECTION */
+#endif
+ #else
+#ifdef __PPC64__
+ h = vec_shiftleftbytes1q(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+#else
+ h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+ /* NEED INSPECTION */
+#endif
+ #endif
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
@@ -154,35 +229,154 @@
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
+#ifdef __PPC64__
+ h = vec_addsaturating16ub(h, vec_load1q(S + j));
+#else
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_subtractsaturating16ub(h, shift); // h=H'(i-1,j-1)+S(i,j)
+#else
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e = vec_load1q(E + j); // e=E'(i,j)
+#else
e = _mm_load_si128(E + j); // e=E'(i,j)
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max16ub(h, e);
+#else
h = _mm_max_epu8(h, e);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max16ub(h, f); // h=H'(i,j)
+#else
h = _mm_max_epu8(h, f); // h=H'(i,j)
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ max = vec_max16ub(max, h); // set max
+#else
max = _mm_max_epu8(max, h); // set max
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H1 + j, h); // save to H'(i,j)
+#else
_mm_store_si128(H1 + j, h); // save to H'(i,j)
+ /* NEED INSPECTION */
+#endif
// now compute E'(i+1,j)
+#ifdef __PPC64__
+ e = vec_subtractsaturating16ub(e, e_del); // e=E'(i,j) - e_del
+#else
e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ t = vec_subtractsaturating16ub(h, oe_del); // h=H'(i,j) - o_del - e_del
+#else
t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e = vec_max16ub(e, t); // e=E'(i+1,j)
+#else
e = _mm_max_epu8(e, t); // e=E'(i+1,j)
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(E + j, e); // save to E'(i+1,j)
+#else
_mm_store_si128(E + j, e); // save to E'(i+1,j)
+ /* NEED INSPECTION */
+#endif
// now compute F'(i,j+1)
+#ifdef __PPC64__
+ f = vec_subtractsaturating16ub(f, e_ins);
+#else
f = _mm_subs_epu8(f, e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ t = vec_subtractsaturating16ub(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+#else
t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ f = vec_max16ub(f, t);
+#else
f = _mm_max_epu8(f, t);
+ /* NEED INSPECTION */
+#endif
// get H'(i-1,j) and prepare for the next j
+#ifdef __PPC64__
+ h = vec_load1q(H0 + j); // h=H'(i-1,j)
+#else
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
+ /* NEED INSPECTION */
+#endif
}
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
- f = _mm_slli_si128(f, 1);
+ #ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+ f = vec_shiftrightbytes1q(f, 1);
+#else
+ f = _mm_srli_si128(f, 1);
+ /* NEED INSPECTION */
+#endif
+ #else
+#ifdef __PPC64__
+ f = vec_shiftleftbytes1q(f, 1);
+#else
+ f = _mm_slli_si128(f, 1);
+ /* NEED INSPECTION */
+#endif
+ #endif
for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+ h = vec_load1q(H1 + j);
+#else
h = _mm_load_si128(H1 + j);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max16ub(h, f); // h=H'(i,j)
+#else
h = _mm_max_epu8(h, f); // h=H'(i,j)
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H1 + j, h);
+#else
_mm_store_si128(H1 + j, h);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_subtractsaturating16ub(h, oe_ins);
+#else
h = _mm_subs_epu8(h, oe_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ f = vec_subtractsaturating16ub(f, e_ins);
+#else
f = _mm_subs_epu8(f, e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ cmp = vec_extractupperbit16sb(vec_compareeq16sb(vec_subtractsaturating16ub(f, h), zero));
+#else
cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
+ /* NEED INSPECTION */
+#endif
if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
}
}
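
For reference, the recurrences that the vectorized loop above evaluates per cell, written as a plain scalar sketch (hedged: the names mirror the comments in this hunk, not any function in bwa; the vector code gets the zero floor for free from unsigned saturation):

/* Hedged scalar sketch of one Smith-Waterman cell update, matching
 * the recurrences quoted in the hunk's comments; illustration only. */
static inline int max3(int a, int b, int c)
{ return a > b ? (a > c ? a : c) : (b > c ? b : c); }

/* h_diag = H(i-1,j-1), e = E(i,j), f = F(i,j), s = S(i,j). */
static int sw_cell(int h_diag, int e, int f, int s,
                   int oe_del, int e_del, int oe_ins, int e_ins,
                   int *e_next, int *f_next)
{
    int h = max3(h_diag + s, e, f);               /* H(i,j) */
    if (h < 0) h = 0;                             /* local-alignment floor */
    *e_next = h - oe_del > e - e_del ? h - oe_del : e - e_del;  /* E(i+1,j) */
    if (*e_next < 0) *e_next = 0;
    *f_next = h - oe_ins > f - e_ins ? h - oe_ins : f - e_ins;  /* F(i,j+1) */
    if (*f_next < 0) *f_next = 0;
    return h;
}
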
@@ -201,7 +395,12 @@
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+#ifdef __PPC64__
+ vec_store1q(Hmax + j, vec_load1q(H1 + j));
+#else
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+ /* NEED INSPECTION */
+#endif
if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
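
The `cmp == 0xffff` early exit above relies on a byte movemask, which SSE2 provides as a single instruction (`_mm_movemask_epi8`) but AltiVec/VSX does not; the vec128int.h shim below (vec_extractupperbit16sb) emulates it by gathering each byte's top bit. A scalar model of that primitive, hedged: `movemask16` is a hypothetical name used only for this illustration.

#include <assert.h>
#include <stdint.h>

/* Scalar model of _mm_movemask_epi8 / vec_extractupperbit16sb:
 * collect the most significant bit of each of 16 bytes into an int. */
static int movemask16(const uint8_t v[16])
{
    int mask = 0, i;
    for (i = 0; i < 16; ++i)
        mask |= ((v[i] >> 7) & 1) << i;
    return mask;
}

int main(void)
{
    uint8_t all_set[16], alternating[16];
    int i;
    for (i = 0; i < 16; ++i) {
        all_set[i]     = 0xFF;
        alternating[i] = (i & 1) ? 0x80 : 0x00;
    }
    /* cmp == 0xffff means every lane of (f - h) saturated to zero,
     * so the lazy-F loop can stop. */
    assert(movemask16(all_set)     == 0xffff);
    assert(movemask16(alternating) == 0xaaaa);
    return 0;
}
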
@@ -236,61 +435,242 @@
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
+#ifdef __PPC64__
#define __max_8(ret, xx) do { \
- (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
- (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
- (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
- (ret) = _mm_extract_epi16((xx), 0); \
- } while (0)
+ (xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 8)); \
+ (xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 4)); \
+ (xx) = vec_max8sh((xx), vec_shiftrightbytes1q((xx), 2)); \
+ (ret) = vec_extract8sh((xx), 0); \
+ } while (0)
+#else
+#define __max_8(ret, xx) do { \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+ (ret) = _mm_extract_epi16((xx), 0); \
+ } while (0)
+#endif
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
- zero = _mm_set1_epi32(0);
+#ifdef __PPC64__
+ zero = vec_splat4sw(0);
+#else
+ zero = _mm_set1_epi32(0); /* !!!REP NOT FOUND!!! */
+#endif
+#ifdef __PPC64__
+ oe_del = vec_splat8sh(_o_del + _e_del);
+#else
oe_del = _mm_set1_epi16(_o_del + _e_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e_del = vec_splat8sh(_e_del);
+#else
e_del = _mm_set1_epi16(_e_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ oe_ins = vec_splat8sh(_o_ins + _e_ins);
+#else
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e_ins = vec_splat8sh(_e_ins);
+#else
e_ins = _mm_set1_epi16(_e_ins);
+ /* NEED INSPECTION */
+#endif
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
+#ifdef __PPC64__
+ vec_store1q(E + i, zero);
+#else
_mm_store_si128(E + i, zero);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H0 + i, zero);
+#else
_mm_store_si128(H0 + i, zero);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(Hmax + i, zero);
+#else
_mm_store_si128(Hmax + i, zero);
+ /* NEED INSPECTION */
+#endif
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+#ifdef __PPC64__
+ h = vec_load1q(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+#else
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
- h = _mm_slli_si128(h, 2);
+ /* NEED INSPECTION */
+#endif
+ #ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+ h = vec_shiftrightbytes1q(h, 2);
+#else
+ h = _mm_srli_si128(h, 2);
+ /* NEED INSPECTION */
+#endif
+ #else
+#ifdef __PPC64__
+ h = vec_shiftleftbytes1q(h, 2);
+#else
+ h = _mm_slli_si128(h, 2);
+ /* NEED INSPECTION */
+#endif
+ #endif
for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+ h = vec_addsaturating8sh(h, *S++);
+#else
h = _mm_adds_epi16(h, *S++);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e = vec_load1q(E + j);
+#else
e = _mm_load_si128(E + j);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max8sh(h, e);
+#else
h = _mm_max_epi16(h, e);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max8sh(h, f);
+#else
h = _mm_max_epi16(h, f);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ max = vec_max8sh(max, h);
+#else
max = _mm_max_epi16(max, h);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H1 + j, h);
+#else
_mm_store_si128(H1 + j, h);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e = vec_subtractsaturating8uh(e, e_del);
+#else
e = _mm_subs_epu16(e, e_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ t = vec_subtractsaturating8uh(h, oe_del);
+#else
t = _mm_subs_epu16(h, oe_del);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ e = vec_max8sh(e, t);
+#else
e = _mm_max_epi16(e, t);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(E + j, e);
+#else
_mm_store_si128(E + j, e);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ f = vec_subtractsaturating8uh(f, e_ins);
+#else
f = _mm_subs_epu16(f, e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ t = vec_subtractsaturating8uh(h, oe_ins);
+#else
t = _mm_subs_epu16(h, oe_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ f = vec_max8sh(f, t);
+#else
f = _mm_max_epi16(f, t);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_load1q(H0 + j);
+#else
h = _mm_load_si128(H0 + j);
+ /* NEED INSPECTION */
+#endif
}
for (k = 0; LIKELY(k < 16); ++k) {
- f = _mm_slli_si128(f, 2);
+ #ifdef __BIG_ENDIAN__
+#ifdef __PPC64__
+ f = vec_shiftrightbytes1q(f, 2);
+#else
+ f = _mm_srli_si128(f, 2);
+ /* NEED INSPECTION */
+#endif
+ #else
+#ifdef __PPC64__
+ f = vec_shiftleftbytes1q(f, 2);
+#else
+ f = _mm_slli_si128(f, 2);
+ /* NEED INSPECTION */
+#endif
+ #endif
for (j = 0; LIKELY(j < slen); ++j) {
+#ifdef __PPC64__
+ h = vec_load1q(H1 + j);
+#else
h = _mm_load_si128(H1 + j);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_max8sh(h, f);
+#else
h = _mm_max_epi16(h, f);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ vec_store1q(H1 + j, h);
+#else
_mm_store_si128(H1 + j, h);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ h = vec_subtractsaturating8uh(h, oe_ins);
+#else
h = _mm_subs_epu16(h, oe_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ f = vec_subtractsaturating8uh(f, e_ins);
+#else
f = _mm_subs_epu16(f, e_ins);
+ /* NEED INSPECTION */
+#endif
+#ifdef __PPC64__
+ if(UNLIKELY(!vec_extractupperbit16sb(vec_comparegt8sh(f, h)))) goto end_loop8;
+#else
if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+ /* NEED INSPECTION */
+#endif
}
}
end_loop8:
@@ -307,7 +687,12 @@
if (imax > gmax) {
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
+#ifdef __PPC64__
+ vec_store1q(Hmax + j, vec_load1q(H1 + j));
+#else
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+ /* NEED INSPECTION */
+#endif
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
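
Taken together, the ksw.c hunks apply one mechanical pattern: each SSE2 intrinsic gains an `#ifdef __PPC64__` twin that calls a PowerVecLib shim with the same lane semantics, and the 128-bit byte shifts additionally flip direction under `__BIG_ENDIAN__`, since SSE2's `_mm_slli_si128` is defined in terms of little-endian memory order. A condensed, hedged sketch of the idiom (`step_h` and `shift_lanes_one_byte` are illustrative helpers, not code from the patch):

#ifdef __PPC64__
#include "vec128int.h"      /* PowerVecLib shims over AltiVec/VSX */
#else
#include <emmintrin.h>      /* SSE2 */
#endif

/* One ISA-neutral step: unsigned-byte max of h against e and f,
 * exactly the pairing the patch repeats in the inner loop. */
static __m128i step_h(__m128i h, __m128i e, __m128i f)
{
#ifdef __PPC64__
    h = vec_max16ub(h, e);
    h = vec_max16ub(h, f);
#else
    h = _mm_max_epu8(h, e);
    h = _mm_max_epu8(h, f);
#endif
    return h;
}

/* Byte-wise lane motion toward higher query offsets. On
 * little-endian x86 this is a left byte shift; on big-endian
 * POWER the same lane motion is a right byte shift. */
static __m128i shift_lanes_one_byte(__m128i h)
{
#if defined(__PPC64__) && defined(__BIG_ENDIAN__)
    return vec_shiftrightbytes1q(h, 1);
#elif defined(__PPC64__)
    return vec_shiftleftbytes1q(h, 1);
#else
    return _mm_slli_si128(h, 1);
#endif
}
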
diff -Naur /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fb/vec128int.h /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fbPPC/vec128int.h
--- /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fb/vec128int.h 1969-12-31 19:00:00.000000000 -0500
+++ /gpfs/gpfs_8mb/cheng/BWA/jbwa/bwa-8e2da1e407972170d1a660286f07a3a3a71ee6fbPPC/vec128int.h 2016-06-09 13:35:39.944962000 -0400
@@ -0,0 +1,276 @@
+/******************************************************************************/
+/* */
+/* Licensed Materials - Property of IBM */
+/* */
+/* IBM Power Vector Intrinisic Functions version 1.0.3 */
+/* */
+/* Copyright IBM Corp. 2014,2016 */
+/* US Government Users Restricted Rights - Use, duplication or */
+/* disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */
+/* */
+/* See the licence in the license subdirectory. */
+/* */
+/* More information on this software is available on the IBM DeveloperWorks */
+/* website at */
+/* https://www.ibm.com/developerworks/community/groups/community/powerveclib */
+/* */
+/******************************************************************************/
+
+#ifndef _H_VEC128INT
+#define _H_VEC128INT
+
+#include <altivec.h>
+
+#define VECLIB_ALIGNED8 __attribute__ ((__aligned__ (8)))
+#define VECLIB_ALIGNED16 __attribute__ ((__aligned__ (16)))
+
+/* Control inlining */
+#ifdef NOT_ALWAYS_INLINE
+ #define VECLIB_INLINE
+#else
+ #define VECLIB_INLINE static inline __attribute__ ((__always_inline__))
+#endif
+#define VECLIB_NOINLINE static __attribute__ ((__noinline__))
+
+#define VECLIB_ALIGNED16 __attribute__ ((__aligned__ (16)))
+
+typedef
+ VECLIB_ALIGNED8
+ unsigned long long
+__m64;
+
+typedef
+ VECLIB_ALIGNED16
+ vector unsigned char
+__m128i;
+
+typedef
+ VECLIB_ALIGNED16
+ union {
+ __m128i as_m128i;
+ __m64 as_m64 [2];
+ vector signed char as_vector_signed_char;
+ vector unsigned char as_vector_unsigned_char;
+ vector bool char as_vector_bool_char;
+ vector signed short as_vector_signed_short;
+ vector unsigned short as_vector_unsigned_short;
+ vector bool short as_vector_bool_short;
+ vector signed int as_vector_signed_int;
+ vector unsigned int as_vector_unsigned_int;
+ vector bool int as_vector_bool_int;
+ vector signed long long as_vector_signed_long_long;
+ vector unsigned long long as_vector_unsigned_long_long;
+ vector bool long long as_vector_bool_long_long;
+ char as_char [16];
+ short as_short [8];
+ int as_int [4];
+ unsigned int as_unsigned_int [4];
+ long long as_long_long [2];
+ } __m128i_union;
+
+typedef const long intlit3; /* 3 bit int literal */
+typedef const long intlit8; /* 8 bit int literal */
+
+/******************************************************** Load ********************************************************/
+
+/* Load 128-bits of integer data, aligned */
+VECLIB_INLINE __m128i vec_load1q (__m128i const* address)
+{
+ return (__m128i) vec_ld (0, (vector unsigned char*) address);
+}
+
+/******************************************************** Set *********************************************************/
+
+/* Splat 8-bit char to 16 8-bit chars */
+VECLIB_INLINE __m128i vec_splat16sb (char scalar)
+{ return (__m128i) vec_splats ((signed char) scalar); }
+
+/* Splat 16-bit short to 8 16-bit shorts */
+VECLIB_INLINE __m128i vec_splat8sh (short scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+/* Splat 32-bit int to 4 32-bit ints */
+VECLIB_INLINE __m128i vec_splat4sw (int scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+/******************************************************** Store *******************************************************/
+
+/* Store 128-bits integer, aligned */
+VECLIB_INLINE void vec_store1q (__m128i* address, __m128i v)
+{ vec_st (v, 0, address); }
+
+
+/******************************************************* Extract ******************************************************/
+
+/* Extract upper bit of 16 8-bit chars */
+VECLIB_INLINE int vec_extractupperbit16sb (__m128i v)
+{
+ __m128i_union t;
+ t.as_m128i = v;
+ int result = 0;
+ #ifdef __LITTLE_ENDIAN__
+ result |= (t.as_char[15] & 0x80) << (15-7);
+ result |= (t.as_char[14] & 0x80) << (14-7);
+ result |= (t.as_char[13] & 0x80) << (13-7);
+ result |= (t.as_char[12] & 0x80) << (12-7);
+ result |= (t.as_char[11] & 0x80) << (11-7);
+ result |= (t.as_char[10] & 0x80) << (10-7);
+ result |= (t.as_char[9] & 0x80) << (9-7);
+ result |= (t.as_char[8] & 0x80) << (8-7);
+ result |= (t.as_char[7] & 0x80);
+ result |= (t.as_char[6] & 0x80) >> (7-6);
+ result |= (t.as_char[5] & 0x80) >> (7-5);
+ result |= (t.as_char[4] & 0x80) >> (7-4);
+ result |= (t.as_char[3] & 0x80) >> (7-3);
+ result |= (t.as_char[2] & 0x80) >> (7-2);
+ result |= (t.as_char[1] & 0x80) >> (7-1);
+ result |= (t.as_char[0] & 0x80) >> 7;
+ #elif __BIG_ENDIAN__
+ result |= (t.as_char[0] & 0x80) << (15-7);
+ result |= (t.as_char[1] & 0x80) << (14-7);
+ result |= (t.as_char[2] & 0x80) << (13-7);
+ result |= (t.as_char[3] & 0x80) << (12-7);
+ result |= (t.as_char[4] & 0x80) << (11-7);
+ result |= (t.as_char[5] & 0x80) << (10-7);
+ result |= (t.as_char[6] & 0x80) << (9-7);
+ result |= (t.as_char[7] & 0x80) << (8-7);
+ result |= (t.as_char[8] & 0x80);
+ result |= (t.as_char[9] & 0x80) >> (7-6);
+ result |= (t.as_char[10] & 0x80) >> (7-5);
+ result |= (t.as_char[11] & 0x80) >> (7-4);
+ result |= (t.as_char[12] & 0x80) >> (7-3);
+ result |= (t.as_char[13] & 0x80) >> (7-2);
+ result |= (t.as_char[14] & 0x80) >> (7-1);
+ result |= (t.as_char[15] & 0x80) >> 7;
+ #endif
+ return result;
+}
+
+/* Extract 16-bit short from one of 8 16-bit shorts */
+VECLIB_INLINE int vec_extract8sh (__m128i v, intlit3 element_from_right)
+{
+ __m128i_union t;
+ #ifdef __LITTLE_ENDIAN__
+ t.as_m128i = v;
+ return t.as_short[element_from_right & 7];
+ #elif __BIG_ENDIAN__
+ static const vector unsigned char permute_selector[8] = {
+ /* To extract specified halfword element into lowest halfword of the left half with other elements zeroed */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1E,0x1F, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 0 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1C,0x1D, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 1 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x1A,0x1B, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 2 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x18,0x19, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 3 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x16,0x17, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 4 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x14,0x15, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 5 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x12,0x13, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F }, /* element 6 */
+ { 0x00,0x01, 0x02,0x03, 0x04,0x05, 0x10,0x11, 0x08,0x09, 0x0A,0x0B, 0x0C,0x0D, 0x0E,0x0F } /* element 7 */
+ };
+ t.as_m128i = vec_perm (vec_splats ((unsigned char) 0), v, permute_selector[element_from_right & 7]);
+ return (short) t.as_long_long[__vsr_left_half_long_long_in_memory];
+ #endif
+}
+
+/***************************************************** Arithmetic *****************************************************/
+
+/* Add 16 8-bit chars with signed saturation */
+VECLIB_INLINE __m128i vec_addsaturating16sb (__m128i left, __m128i right)
+{
+ return (__m128i) vec_adds ((vector signed char) left, (vector signed char) right);
+}
+
+/* Add 16 8-bit chars with unsigned saturation */
+VECLIB_INLINE __m128i vec_addsaturating16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_adds ((vector unsigned char) left, (vector unsigned char) right); }
+
+/* Add 8 16-bit shorts with signed saturation */
+VECLIB_INLINE __m128i vec_addsaturating8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_adds ((vector signed short) left, (vector signed short) right); }
+
+/* Subtract 16 8-bit chars with unsigned saturation */
+VECLIB_INLINE __m128i vec_subtractsaturating16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_subs ((vector unsigned char) left, (vector unsigned char) right); }
+
+/* Subtract 8 16-bit shorts with unsigned saturation */
+VECLIB_INLINE __m128i vec_subtractsaturating8uh (__m128i left, __m128i right)
+{ return (__m128i) vec_subs ((vector unsigned short) left, (vector unsigned short) right); }
+
+/* Max 8 16-bit shorts */
+VECLIB_INLINE __m128i vec_max8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_max ((vector signed short) left, (vector signed short) right); }
+
+/* Max 16 8-bit unsigned chars */
+VECLIB_INLINE __m128i vec_max16ub (__m128i left, __m128i right)
+{ return (__m128i) vec_max ((vector unsigned char) left, (vector unsigned char) right); }
+
+#ifdef __LITTLE_ENDIAN__
+ #define LEleft_BEright left
+ #define LEright_BEleft right
+#elif __BIG_ENDIAN__
+ #define LEleft_BEright right
+ #define LEright_BEleft left
+#endif
+
+/****************************************************** Shift *********************************************************/
+
+/*- SSE2 shifts >= 32 produce 0; Altivec/MMX shifts by count%count_size. */
+/*- The Altivec spec says the element shift count vector register must have a shift count in each element */
+/*- and the shift counts may be different for each element. */
+/*- The PowerPC Architecture says all elements must contain the same shift count. That wins. */
+/*- The element shift count_size is: byte shift: 3 bits (0-7), halfword: 4 bits (0-15), word: 5 bits (0-31). */
+/*- For full vector shifts the Altivec/PowerPC bit shift count is in the rightmost 7 bits, */
+/*- with a 4 bit slo/sro byte shift count then a 3 bit sll/srl bit shift count. */
+
+/* Shift left */
+
+/* Shift 128-bits left logical immediate by bytes */
+VECLIB_INLINE __m128i vec_shiftleftbytes1q (__m128i v, intlit8 bytecount)
+{
+ if ((unsigned long) bytecount >= 16)
+ {
+ /* SSE2 shifts >= element_size or < 0 produce 0; Altivec/MMX shifts by bytecount%element_size. */
+ return (__m128i) vec_splats (0);
+ } else if (bytecount == 0) {
+ return v;
+ } else {
+ /* The PowerPC byte shift count must be multiplied by 8. */
+ /* It need not but can be replicated, which handles both LE and BE shift count positioning. */
+ __m128i_union replicated_count;
+ replicated_count.as_m128i = vec_splat16sb (bytecount << 3);
+ return (__m128i) vec_slo (v, replicated_count.as_m128i);
+ }
+}
+
+/* Shift right */
+
+/* Shift 128-bits right logical immediate by bytes */
+VECLIB_INLINE __m128i vec_shiftrightbytes1q (__m128i v, intlit8 bytecount)
+{
+ if ((unsigned long) bytecount >= 16)
+ {
+ /* SSE2 shifts >= element_size or < 0 produce 0; Altivec/MMX shifts by bytecount%element_size. */
+ return (__m128i) vec_splats (0);
+ } else if (bytecount == 0) {
+ return v;
+ } else {
+ /* The PowerPC byte shift count must be multiplied by 8. */
+ /* It need not but can be replicated, which handles both LE and BE shift count positioning. */
+ __m128i_union replicated_count;
+ replicated_count.as_m128i = vec_splat16sb (bytecount << 3);
+ /* AT gcc v7.1 may miscompile vec_sro as vec_slo? */
+ return (__m128i) vec_sro (v, replicated_count.as_m128i);
+ }
+}
+
+/******************************************************* Compare ******************************************************/
+
+/* Compare eq */
+
+/* Compare 16 8-bit chars for == to vector mask */
+VECLIB_INLINE __m128i vec_compareeq16sb (__m128i left, __m128i right)
+{ return (__m128i) vec_cmpeq ((vector signed char) left, (vector signed char) right); }
+
+/* Compare 8 16-bit shorts for > to vector mask */
+VECLIB_INLINE __m128i vec_comparegt8sh (__m128i left, __m128i right)
+{ return (__m128i) vec_cmpgt ((vector signed short) left, (vector signed short) right); }
+
+#endif
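
A minimal usage sketch for the shims above, hedged: only the `vec_*` names, the `__m128i` type, and `VECLIB_ALIGNED16` come from this header; the data and the program around them are illustrative. On POWER it should build with something like `gcc -maltivec -O2 demo.c`.

#include <stdio.h>
#include "vec128int.h"

int main(void)
{
    VECLIB_ALIGNED16 unsigned char a[16], b[16], out[16];
    int i;
    for (i = 0; i < 16; ++i) { a[i] = (unsigned char)(200 + i); b[i] = 100; }

    __m128i va  = vec_load1q((__m128i const*) a);
    __m128i vb  = vec_load1q((__m128i const*) b);
    __m128i sum = vec_addsaturating16ub(va, vb);   /* every lane saturates at 255 */
    __m128i mx  = vec_max16ub(sum, vec_splat16sb((char) 0xF0));

    vec_store1q((__m128i*) out, mx);
    printf("upper-bit mask: 0x%04x\n", vec_extractupperbit16sb(mx)); /* 0xffff */
    for (i = 0; i < 16; ++i) printf("%u ", out[i]);                  /* 255 x 16 */
    printf("\n");
    return 0;
}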